209 files changed, 15370 insertions, 2601 deletions
diff --git a/fs/9p/9p.c b/fs/9p/9p.c
new file mode 100644
index 00000000000..e847f504a47
--- /dev/null
+++ b/fs/9p/9p.c
@@ -0,0 +1,359 @@
+/*
+ *  linux/fs/9p/9p.c
+ *
+ *  This file contains functions 9P2000 functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "mux.h"
+
+/**
+ * v9fs_t_version - negotiate protocol parameters with sever
+ * @v9ses: 9P2000 session information
+ * @msize: requested max size packet
+ * @version: requested version.extension string
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+	       char *version, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version);
+	msg.id = TVERSION;
+	msg.params.tversion.msize = msize;
+	msg.params.tversion.version = version;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_attach - mount the server
+ * @v9ses: 9P2000 session information
+ * @uname: user name doing the attach
+ * @aname: remote name being attached to
+ * @fid: mount fid to attatch to root node
+ * @afid: authentication fid (in this case result key)
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+	      u32 fid, u32 afid, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname,
+		aname, fid, afid);
+	msg.id = TATTACH;
+	msg.params.tattach.fid = fid;
+	msg.params.tattach.afid = afid;
+	msg.params.tattach.uname = uname;
+	msg.params.tattach.aname = aname;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_clunk - release a fid (finish a transaction)
+ * @v9ses: 9P2000 session information
+ * @fid: fid to release
+ * @fcall: pointer to response fcall pointer
+ *
+ */
+
+int
+v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+	     struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	msg.id = TCLUNK;
+	msg.params.tclunk.fid = fid;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_v9fs_t_flush - flush a pending transaction
+ * @v9ses: 9P2000 session information
+ * @tag: tid to release
+ *
+ */
+
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "oldtag %d\n", tag);
+	msg.id = TFLUSH;
+	msg.params.tflush.oldtag = tag;
+	return v9fs_mux_rpc(v9ses, &msg, NULL);
+}
+
+/**
+ * v9fs_t_stat - read a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to file or directory to get info about
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	if (fcall)
+		*fcall = NULL;
+
+	msg.id = TSTAT;
+	msg.params.tstat.fid = fid;
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_wstat - write a file's meta-data
+ * @v9ses: 9P2000 session information
+ * @fid: fid pointing to file or directory to write info about
+ * @stat: metadata
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+	     struct v9fs_stat *stat, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length);
+	msg.id = TWSTAT;
+	msg.params.twstat.fid = fid;
+	msg.params.twstat.stat = stat;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_walk - walk a fid to a new file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to walk
+ * @newfid: new fid (for clone operations)
+ * @name: path to walk fid to
+ * @fcall: pointer to response fcall
+ *
+ */
+
+/* TODO: support multiple walk */
+
+int
+v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+	    char *name, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name);
+	msg.id = TWALK;
+	msg.params.twalk.fid = fid;
+	msg.params.twalk.newfid = newfid;
+
+	if (name) {
+		msg.params.twalk.nwname = 1;
+		msg.params.twalk.wnames = &name;
+	} else {
+		msg.params.twalk.nwname = 0;
+	}
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_open - open a file
+ *
+ * @v9ses - 9P2000 session information
+ * @fid - fid to open
+ * @mode - mode to open file (R, RW, etc)
+ * @fcall - pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+	    struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode);
+	msg.id = TOPEN;
+	msg.params.topen.fid = fid;
+	msg.params.topen.mode = mode;
+
+	errorno = v9fs_mux_rpc(v9ses, &msg, fcall);
+
+	return errorno;
+}
+
+/**
+ * v9fs_t_remove - remove a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to remove
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+	      struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d\n", fid);
+	msg.id = TREMOVE;
+	msg.params.tremove.fid = fid;
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_create - create a file or directory
+ * @v9ses: 9P2000 session information
+ * @fid: fid to create
+ * @name: name of the file or directory to create
+ * @perm: permissions to create with
+ * @mode: mode to open file (R, RW, etc)
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+	      u32 perm, u8 mode, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+
+	dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n",
+		fid, name, perm, mode);
+
+	msg.id = TCREATE;
+	msg.params.tcreate.fid = fid;
+	msg.params.tcreate.name = name;
+	msg.params.tcreate.perm = perm;
+	msg.params.tcreate.mode = mode;
+
+	return v9fs_mux_rpc(v9ses, &msg, fcall);
+}
+
+/**
+ * v9fs_t_read - read data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to read from
+ * @offset: offset to start read at
+ * @count: how many bytes to read
+ * @fcall: pointer to response fcall (with data)
+ *
+ */
+
+int
+v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+	    u32 count, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	struct v9fs_fcall *rc = NULL;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid,
+		(long unsigned int)offset, count);
+	msg.id = TREAD;
+	msg.params.tread.fid = fid;
+	msg.params.tread.offset = offset;
+	msg.params.tread.count = count;
+	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+
+	if (!errorno) {
+		errorno = rc->params.rread.count;
+		dump_data(rc->params.rread.data, rc->params.rread.count);
+	}
+
+	if (fcall)
+		*fcall = rc;
+	else
+		kfree(rc);
+
+	return errorno;
+}
+
+/**
+ * v9fs_t_write - write data
+ * @v9ses: 9P2000 session information
+ * @fid: fid to write to
+ * @offset: offset to start write at
+ * @count: how many bytes to write
+ * @fcall: pointer to response fcall
+ *
+ */
+
+int
+v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid,
+	     u64 offset, u32 count, void *data, struct v9fs_fcall **fcall)
+{
+	struct v9fs_fcall msg;
+	struct v9fs_fcall *rc = NULL;
+	long errorno = -1;
+
+	dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid,
+		(unsigned long long)offset, count);
+	dump_data(data, count);
+
+	msg.id = TWRITE;
+	msg.params.twrite.fid = fid;
+	msg.params.twrite.offset = offset;
+	msg.params.twrite.count = count;
+	msg.params.twrite.data = data;
+
+	errorno = v9fs_mux_rpc(v9ses, &msg, &rc);
+
+	if (!errorno)
+		errorno = rc->params.rwrite.count;
+
+	if (fcall)
+		*fcall = rc;
+	else
+		kfree(rc);
+
+	return errorno;
+}
diff --git a/fs/9p/9p.h b/fs/9p/9p.h
new file mode 100644
index 00000000000..f55424216be
--- /dev/null
+++ b/fs/9p/9p.h
@@ -0,0 +1,341 @@
+/*
+ * linux/fs/9p/9p.h
+ *
+ * 9P protocol definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* Message Types */
+enum {
+	TVERSION = 100,
+	RVERSION,
+	TAUTH = 102,
+	RAUTH,
+	TATTACH = 104,
+	RATTACH,
+	TERROR = 106,
+	RERROR,
+	TFLUSH = 108,
+	RFLUSH,
+	TWALK = 110,
+	RWALK,
+	TOPEN = 112,
+	ROPEN,
+	TCREATE = 114,
+	RCREATE,
+	TREAD = 116,
+	RREAD,
+	TWRITE = 118,
+	RWRITE,
+	TCLUNK = 120,
+	RCLUNK,
+	TREMOVE = 122,
+	RREMOVE,
+	TSTAT = 124,
+	RSTAT,
+	TWSTAT = 126,
+	RWSTAT,
+};
+
+/* modes */
+enum {
+	V9FS_OREAD = 0x00,
+	V9FS_OWRITE = 0x01,
+	V9FS_ORDWR = 0x02,
+	V9FS_OEXEC = 0x03,
+	V9FS_OEXCL = 0x04,
+	V9FS_OTRUNC = 0x10,
+	V9FS_OREXEC = 0x20,
+	V9FS_ORCLOSE = 0x40,
+	V9FS_OAPPEND = 0x80,
+};
+
+/* permissions */
+enum {
+	V9FS_DMDIR = 0x80000000,
+	V9FS_DMAPPEND = 0x40000000,
+	V9FS_DMEXCL = 0x20000000,
+	V9FS_DMMOUNT = 0x10000000,
+	V9FS_DMAUTH = 0x08000000,
+	V9FS_DMTMP = 0x04000000,
+	V9FS_DMSYMLINK = 0x02000000,
+	V9FS_DMLINK = 0x01000000,
+	/* 9P2000.u extensions */
+	V9FS_DMDEVICE = 0x00800000,
+	V9FS_DMNAMEDPIPE = 0x00200000,
+	V9FS_DMSOCKET = 0x00100000,
+	V9FS_DMSETUID = 0x00080000,
+	V9FS_DMSETGID = 0x00040000,
+};
+
+/* qid.types */
+enum {
+	V9FS_QTDIR = 0x80,
+	V9FS_QTAPPEND = 0x40,
+	V9FS_QTEXCL = 0x20,
+	V9FS_QTMOUNT = 0x10,
+	V9FS_QTAUTH = 0x08,
+	V9FS_QTTMP = 0x04,
+	V9FS_QTSYMLINK = 0x02,
+	V9FS_QTLINK = 0x01,
+	V9FS_QTFILE = 0x00,
+};
+
+/* ample room for Twrite/Rread header (iounit) */
+#define V9FS_IOHDRSZ	24
+
+/* qids are the unique ID for a file (like an inode */
+struct v9fs_qid {
+	u8 type;
+	u32 version;
+	u64 path;
+};
+
+/* Plan 9 file metadata (stat) structure */
+struct v9fs_stat {
+	u16 size;
+	u16 type;
+	u32 dev;
+	struct v9fs_qid qid;
+	u32 mode;
+	u32 atime;
+	u32 mtime;
+	u64 length;
+	char *name;
+	char *uid;
+	char *gid;
+	char *muid;
+	char *extension;	/* 9p2000.u extensions */
+	u32 n_uid;		/* 9p2000.u extensions */
+	u32 n_gid;		/* 9p2000.u extensions */
+	u32 n_muid;		/* 9p2000.u extensions */
+	char data[0];
+};
+
+/* Structures for Protocol Operations */
+
+struct Tversion {
+	u32 msize;
+	char *version;
+};
+
+struct Rversion {
+	u32 msize;
+	char *version;
+};
+
+struct Tauth {
+	u32 afid;
+	char *uname;
+	char *aname;
+};
+
+struct Rauth {
+	struct v9fs_qid qid;
+};
+
+struct Rerror {
+	char *error;
+	u32 errno;		/* 9p2000.u extension */
+};
+
+struct Tflush {
+	u32 oldtag;
+};
+
+struct Rflush {
+};
+
+struct Tattach {
+	u32 fid;
+	u32 afid;
+	char *uname;
+	char *aname;
+};
+
+struct Rattach {
+	struct v9fs_qid qid;
+};
+
+struct Twalk {
+	u32 fid;
+	u32 newfid;
+	u32 nwname;
+	char **wnames;
+};
+
+struct Rwalk {
+	u32 nwqid;
+	struct v9fs_qid *wqids;
+};
+
+struct Topen {
+	u32 fid;
+	u8 mode;
+};
+
+struct Ropen {
+	struct v9fs_qid qid;
+	u32 iounit;
+};
+
+struct Tcreate {
+	u32 fid;
+	char *name;
+	u32 perm;
+	u8 mode;
+};
+
+struct Rcreate {
+	struct v9fs_qid qid;
+	u32 iounit;
+};
+
+struct Tread {
+	u32 fid;
+	u64 offset;
+	u32 count;
+};
+
+struct Rread {
+	u32 count;
+	u8 *data;
+};
+
+struct Twrite {
+	u32 fid;
+	u64 offset;
+	u32 count;
+	u8 *data;
+};
+
+struct Rwrite {
+	u32 count;
+};
+
+struct Tclunk {
+	u32 fid;
+};
+
+struct Rclunk {
+};
+
+struct Tremove {
+	u32 fid;
+};
+
+struct Rremove {
+};
+
+struct Tstat {
+	u32 fid;
+};
+
+struct Rstat {
+	struct v9fs_stat *stat;
+};
+
+struct Twstat {
+	u32 fid;
+	struct v9fs_stat *stat;
+};
+
+struct Rwstat {
+};
+
+/*
+  * fcall is the primary packet structure
+  *
+  */
+
+struct v9fs_fcall {
+	u32 size;
+	u8 id;
+	u16 tag;
+
+	union {
+		struct Tversion tversion;
+		struct Rversion rversion;
+		struct Tauth tauth;
+		struct Rauth rauth;
+		struct Rerror rerror;
+		struct Tflush tflush;
+		struct Rflush rflush;
+		struct Tattach tattach;
+		struct Rattach rattach;
+		struct Twalk twalk;
+		struct Rwalk rwalk;
+		struct Topen topen;
+		struct Ropen ropen;
+		struct Tcreate tcreate;
+		struct Rcreate rcreate;
+		struct Tread tread;
+		struct Rread rread;
+		struct Twrite twrite;
+		struct Rwrite rwrite;
+		struct Tclunk tclunk;
+		struct Rclunk rclunk;
+		struct Tremove tremove;
+		struct Rremove rremove;
+		struct Tstat tstat;
+		struct Rstat rstat;
+		struct Twstat twstat;
+		struct Rwstat rwstat;
+	} params;
+};
+
+#define FCALL_ERROR(fcall) (fcall ? fcall->params.rerror.error : "")
+
+int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize,
+		   char *version, struct v9fs_fcall **rcall);
+
+int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
+		  u32 fid, u32 afid, struct v9fs_fcall **rcall);
+
+int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid,
+		 struct v9fs_fcall **rcall);
+
+int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag);
+
+int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid,
+		struct v9fs_fcall **rcall);
+
+int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid,
+		 struct v9fs_stat *stat, struct v9fs_fcall **rcall);
+
+int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid,
+		char *name, struct v9fs_fcall **rcall);
+
+int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode,
+		struct v9fs_fcall **rcall);
+
+int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid,
+		  struct v9fs_fcall **rcall);
+
+int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name,
+		  u32 perm, u8 mode, struct v9fs_fcall **rcall);
+
+int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid,
+		u64 offset, u32 count, struct v9fs_fcall **rcall);
+
+int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset,
+		 u32 count, void *data, struct v9fs_fcall **rcall);
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
new file mode 100644
index 00000000000..e4e4ffe5a7d
--- /dev/null
+++ b/fs/9p/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_9P_FS) := 9p2000.o
+
+9p2000-objs := \
+	vfs_super.o \
+	vfs_inode.o \
+	vfs_file.o \
+	vfs_dir.o \
+	vfs_dentry.o \
+	error.o \
+	mux.o \
+	trans_fd.o \
+	trans_sock.o \
+	9p.o \
+	conv.o \
+	v9fs.o \
+	fid.o
+
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
new file mode 100644
index 00000000000..1554731bd65
--- /dev/null
+++ b/fs/9p/conv.c
@@ -0,0 +1,693 @@
+/*
+ * linux/fs/9p/conv.c
+ *
+ * 9P protocol conversion functions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "conv.h"
+
+/*
+ * Buffer to help with string parsing
+ */
+struct cbuf {
+	unsigned char *sp;
+	unsigned char *p;
+	unsigned char *ep;
+};
+
+static inline void buf_init(struct cbuf *buf, void *data, int datalen)
+{
+	buf->sp = buf->p = data;
+	buf->ep = data + datalen;
+}
+
+static inline int buf_check_overflow(struct cbuf *buf)
+{
+	return buf->p > buf->ep;
+}
+
+static inline void buf_check_size(struct cbuf *buf, int len)
+{
+	if (buf->p+len > buf->ep) {
+		if (buf->p < buf->ep) {
+			eprintk(KERN_ERR, "buffer overflow\n");
+			buf->p = buf->ep + 1;
+		}
+	}
+}
+
+static inline void *buf_alloc(struct cbuf *buf, int len)
+{
+	void *ret = NULL;
+
+	buf_check_size(buf, len);
+	ret = buf->p;
+	buf->p += len;
+
+	return ret;
+}
+
+static inline void buf_put_int8(struct cbuf *buf, u8 val)
+{
+	buf_check_size(buf, 1);
+
+	buf->p[0] = val;
+	buf->p++;
+}
+
+static inline void buf_put_int16(struct cbuf *buf, u16 val)
+{
+	buf_check_size(buf, 2);
+
+	*(__le16 *) buf->p = cpu_to_le16(val);
+	buf->p += 2;
+}
+
+static inline void buf_put_int32(struct cbuf *buf, u32 val)
+{
+	buf_check_size(buf, 4);
+
+	*(__le32 *)buf->p = cpu_to_le32(val);
+	buf->p += 4;
+}
+
+static inline void buf_put_int64(struct cbuf *buf, u64 val)
+{
+	buf_check_size(buf, 8);
+
+	*(__le64 *)buf->p = cpu_to_le64(val);
+	buf->p += 8;
+}
+
+static inline void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
+{
+	buf_check_size(buf, slen + 2);
+
+	buf_put_int16(buf, slen);
+	memcpy(buf->p, s, slen);
+	buf->p += slen;
+}
+
+static inline void buf_put_string(struct cbuf *buf, const char *s)
+{
+	buf_put_stringn(buf, s, strlen(s));
+}
+
+static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen)
+{
+	buf_check_size(buf, datalen);
+
+	memcpy(buf->p, data, datalen);
+	buf->p += datalen;
+}
+
+static inline u8 buf_get_int8(struct cbuf *buf)
+{
+	u8 ret = 0;
+
+	buf_check_size(buf, 1);
+	ret = buf->p[0];
+
+	buf->p++;
+
+	return ret;
+}
+
+static inline u16 buf_get_int16(struct cbuf *buf)
+{
+	u16 ret = 0;
+
+	buf_check_size(buf, 2);
+	ret = le16_to_cpu(*(__le16 *)buf->p);
+
+	buf->p += 2;
+
+	return ret;
+}
+
+static inline u32 buf_get_int32(struct cbuf *buf)
+{
+	u32 ret = 0;
+
+	buf_check_size(buf, 4);
+	ret = le32_to_cpu(*(__le32 *)buf->p);
+
+	buf->p += 4;
+
+	return ret;
+}
+
+static inline u64 buf_get_int64(struct cbuf *buf)
+{
+	u64 ret = 0;
+
+	buf_check_size(buf, 8);
+	ret = le64_to_cpu(*(__le64 *)buf->p);
+
+	buf->p += 8;
+
+	return ret;
+}
+
+static inline int
+buf_get_string(struct cbuf *buf, char *data, unsigned int datalen)
+{
+
+	u16 len = buf_get_int16(buf);
+	buf_check_size(buf, len);
+	if (len + 1 > datalen)
+		return 0;
+
+	memcpy(data, buf->p, len);
+	data[len] = 0;
+	buf->p += len;
+
+	return len + 1;
+}
+
+static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
+{
+	char *ret = NULL;
+	int n = buf_get_string(buf, sbuf->p, sbuf->ep - sbuf->p);
+
+	if (n > 0) {
+		ret = sbuf->p;
+		sbuf->p += n;
+	}
+
+	return ret;
+}
+
+static inline int buf_get_data(struct cbuf *buf, void *data, int datalen)
+{
+	buf_check_size(buf, datalen);
+
+	memcpy(data, buf->p, datalen);
+	buf->p += datalen;
+
+	return datalen;
+}
+
+static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf,
+				  int datalen)
+{
+	char *ret = NULL;
+	int n = 0;
+
+	buf_check_size(dbuf, datalen);
+
+	n = buf_get_data(buf, dbuf->p, datalen);
+
+	if (n > 0) {
+		ret = dbuf->p;
+		dbuf->p += n;
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_size_stat - calculate the size of a variable length stat struct
+ * @v9ses: session information
+ * @stat: metadata (stat) structure
+ *
+ */
+
+static int v9fs_size_stat(struct v9fs_session_info *v9ses,
+			  struct v9fs_stat *stat)
+{
+	int size = 0;
+
+	if (stat == NULL) {
+		eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n");
+		return 0;
+	}
+
+	size =			/* 2 + *//* size[2] */
+	    2 +			/* type[2] */
+	    4 +			/* dev[4] */
+	    1 +			/* qid.type[1] */
+	    4 +			/* qid.vers[4] */
+	    8 +			/* qid.path[8] */
+	    4 +			/* mode[4] */
+	    4 +			/* atime[4] */
+	    4 +			/* mtime[4] */
+	    8 +			/* length[8] */
+	    8;			/* minimum sum of string lengths */
+
+	if (stat->name)
+		size += strlen(stat->name);
+	if (stat->uid)
+		size += strlen(stat->uid);
+	if (stat->gid)
+		size += strlen(stat->gid);
+	if (stat->muid)
+		size += strlen(stat->muid);
+
+	if (v9ses->extended) {
+		size += 4 +	/* n_uid[4] */
+		    4 +		/* n_gid[4] */
+		    4 +		/* n_muid[4] */
+		    2;		/* string length of extension[4] */
+		if (stat->extension)
+			size += strlen(stat->extension);
+	}
+
+	return size;
+}
+
+/**
+ * serialize_stat - safely format a stat structure for transmission
+ * @v9ses: session info
+ * @stat: metadata (stat) structure
+ * @bufp: buffer to serialize structure into
+ *
+ */
+
+static int
+serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat,
+	       struct cbuf *bufp)
+{
+	buf_put_int16(bufp, stat->size);
+	buf_put_int16(bufp, stat->type);
+	buf_put_int32(bufp, stat->dev);
+	buf_put_int8(bufp, stat->qid.type);
+	buf_put_int32(bufp, stat->qid.version);
+	buf_put_int64(bufp, stat->qid.path);
+	buf_put_int32(bufp, stat->mode);
+	buf_put_int32(bufp, stat->atime);
+	buf_put_int32(bufp, stat->mtime);
+	buf_put_int64(bufp, stat->length);
+
+	buf_put_string(bufp, stat->name);
+	buf_put_string(bufp, stat->uid);
+	buf_put_string(bufp, stat->gid);
+	buf_put_string(bufp, stat->muid);
+
+	if (v9ses->extended) {
+		buf_put_string(bufp, stat->extension);
+		buf_put_int32(bufp, stat->n_uid);
+		buf_put_int32(bufp, stat->n_gid);
+		buf_put_int32(bufp, stat->n_muid);
+	}
+
+	if (buf_check_overflow(bufp))
+		return 0;
+
+	return stat->size;
+}
+
+/**
+ * deserialize_stat - safely decode a recieved metadata (stat) structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @stat: metadata (stat) structure
+ * @dbufp: buffer to deserialize variable strings into
+ *
+ */
+
+static inline int
+deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp,
+		 struct v9fs_stat *stat, struct cbuf *dbufp)
+{
+
+	stat->size = buf_get_int16(bufp);
+	stat->type = buf_get_int16(bufp);
+	stat->dev = buf_get_int32(bufp);
+	stat->qid.type = buf_get_int8(bufp);
+	stat->qid.version = buf_get_int32(bufp);
+	stat->qid.path = buf_get_int64(bufp);
+	stat->mode = buf_get_int32(bufp);
+	stat->atime = buf_get_int32(bufp);
+	stat->mtime = buf_get_int32(bufp);
+	stat->length = buf_get_int64(bufp);
+	stat->name = buf_get_stringb(bufp, dbufp);
+	stat->uid = buf_get_stringb(bufp, dbufp);
+	stat->gid = buf_get_stringb(bufp, dbufp);
+	stat->muid = buf_get_stringb(bufp, dbufp);
+
+	if (v9ses->extended) {
+		stat->extension = buf_get_stringb(bufp, dbufp);
+		stat->n_uid = buf_get_int32(bufp);
+		stat->n_gid = buf_get_int32(bufp);
+		stat->n_muid = buf_get_int32(bufp);
+	}
+
+	if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+		return 0;
+
+	return stat->size + 2;
+}
+
+/**
+ * deserialize_statb - wrapper for decoding a received metadata structure
+ * @v9ses: session info
+ * @bufp: buffer to deserialize
+ * @dbufp: buffer to deserialize variable strings into
+ *
+ */
+
+static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info
+						  *v9ses, struct cbuf *bufp,
+						  struct cbuf *dbufp)
+{
+	struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat));
+
+	if (ret) {
+		int n = deserialize_stat(v9ses, bufp, ret, dbufp);
+		if (n <= 0)
+			return NULL;
+	}
+
+	return ret;
+}
+
+/**
+ * v9fs_deserialize_stat - decode a received metadata structure
+ * @v9ses: session info
+ * @buf: buffer to deserialize
+ * @buflen: length of received buffer
+ * @stat: metadata structure to decode into
+ * @statlen: length of destination metadata structure
+ *
+ */
+
+int
+v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf,
+		      u32 buflen, struct v9fs_stat *stat, u32 statlen)
+{
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+	struct cbuf dbuffer;
+	struct cbuf *dbufp = &dbuffer;
+
+	buf_init(bufp, buf, buflen);
+	buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat),
+		 statlen - sizeof(struct v9fs_stat));
+
+	return deserialize_stat(v9ses, bufp, stat, dbufp);
+}
+
+static inline int
+v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall)
+{
+	int size = 4 + 1 + 2;	/* size[4] msg[1] tag[2] */
+	int i = 0;
+
+	switch (fcall->id) {
+	default:
+		eprintk(KERN_ERR, "bad msg type %d\n", fcall->id);
+		return 0;
+	case TVERSION:		/* msize[4] version[s] */
+		size += 4 + 2 + strlen(fcall->params.tversion.version);
+		break;
+	case TAUTH:		/* afid[4] uname[s] aname[s] */
+		size += 4 + 2 + strlen(fcall->params.tauth.uname) +
+		    2 + strlen(fcall->params.tauth.aname);
+		break;
+	case TFLUSH:		/* oldtag[2] */
+		size += 2;
+		break;
+	case TATTACH:		/* fid[4] afid[4] uname[s] aname[s] */
+		size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) +
+		    2 + strlen(fcall->params.tattach.aname);
+		break;
+	case TWALK:		/* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+		size += 4 + 4 + 2;
+		/* now compute total for the array of names */
+		for (i = 0; i < fcall->params.twalk.nwname; i++)
+			size += 2 + strlen(fcall->params.twalk.wnames[i]);
+		break;
+	case TOPEN:		/* fid[4] mode[1] */
+		size += 4 + 1;
+		break;
+	case TCREATE:		/* fid[4] name[s] perm[4] mode[1] */
+		size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1;
+		break;
+	case TREAD:		/* fid[4] offset[8] count[4] */
+		size += 4 + 8 + 4;
+		break;
+	case TWRITE:		/* fid[4] offset[8] count[4] data[count] */
+		size += 4 + 8 + 4 + fcall->params.twrite.count;
+		break;
+	case TCLUNK:		/* fid[4] */
+		size += 4;
+		break;
+	case TREMOVE:		/* fid[4] */
+		size += 4;
+		break;
+	case TSTAT:		/* fid[4] */
+		size += 4;
+		break;
+	case TWSTAT:		/* fid[4] stat[n] */
+		fcall->params.twstat.stat->size =
+		    v9fs_size_stat(v9ses, fcall->params.twstat.stat);
+		size += 4 + 2 + 2 + fcall->params.twstat.stat->size;
+	}
+	return size;
+}
+
+/*
+ * v9fs_serialize_fcall - marshall fcall struct into a packet
+ * @v9ses: session information
+ * @fcall: structure to convert
+ * @data: buffer to serialize fcall into
+ * @datalen: length of buffer to serialize fcall into
+ *
+ */
+
+int
+v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall,
+		     void *data, u32 datalen)
+{
+	int i = 0;
+	struct v9fs_stat *stat = NULL;
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+
+	buf_init(bufp, data, datalen);
+
+	if (!fcall) {
+		eprintk(KERN_ERR, "no fcall\n");
+		return -EINVAL;
+	}
+
+	fcall->size = v9fs_size_fcall(v9ses, fcall);
+
+	buf_put_int32(bufp, fcall->size);
+	buf_put_int8(bufp, fcall->id);
+	buf_put_int16(bufp, fcall->tag);
+
+	dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id,
+		fcall->tag);
+
+	/* now encode it */
+	switch (fcall->id) {
+	default:
+		eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id);
+		return -EPROTO;
+	case TVERSION:
+		buf_put_int32(bufp, fcall->params.tversion.msize);
+		buf_put_string(bufp, fcall->params.tversion.version);
+		break;
+	case TAUTH:
+		buf_put_int32(bufp, fcall->params.tauth.afid);
+		buf_put_string(bufp, fcall->params.tauth.uname);
+		buf_put_string(bufp, fcall->params.tauth.aname);
+		break;
+	case TFLUSH:
+		buf_put_int16(bufp, fcall->params.tflush.oldtag);
+		break;
+	case TATTACH:
+		buf_put_int32(bufp, fcall->params.tattach.fid);
+		buf_put_int32(bufp, fcall->params.tattach.afid);
+		buf_put_string(bufp, fcall->params.tattach.uname);
+		buf_put_string(bufp, fcall->params.tattach.aname);
+		break;
+	case TWALK:
+		buf_put_int32(bufp, fcall->params.twalk.fid);
+		buf_put_int32(bufp, fcall->params.twalk.newfid);
+		buf_put_int16(bufp, fcall->params.twalk.nwname);
+		for (i = 0; i < fcall->params.twalk.nwname; i++)
+			buf_put_string(bufp, fcall->params.twalk.wnames[i]);
+		break;
+	case TOPEN:
+		buf_put_int32(bufp, fcall->params.topen.fid);
+		buf_put_int8(bufp, fcall->params.topen.mode);
+		break;
+	case TCREATE:
+		buf_put_int32(bufp, fcall->params.tcreate.fid);
+		buf_put_string(bufp, fcall->params.tcreate.name);
+		buf_put_int32(bufp, fcall->params.tcreate.perm);
+		buf_put_int8(bufp, fcall->params.tcreate.mode);
+		break;
+	case TREAD:
+		buf_put_int32(bufp, fcall->params.tread.fid);
+		buf_put_int64(bufp, fcall->params.tread.offset);
+		buf_put_int32(bufp, fcall->params.tread.count);
+		break;
+	case TWRITE:
+		buf_put_int32(bufp, fcall->params.twrite.fid);
+		buf_put_int64(bufp, fcall->params.twrite.offset);
+		buf_put_int32(bufp, fcall->params.twrite.count);
+		buf_put_data(bufp, fcall->params.twrite.data,
+			     fcall->params.twrite.count);
+		break;
+	case TCLUNK:
+		buf_put_int32(bufp, fcall->params.tclunk.fid);
+		break;
+	case TREMOVE:
+		buf_put_int32(bufp, fcall->params.tremove.fid);
+		break;
+	case TSTAT:
+		buf_put_int32(bufp, fcall->params.tstat.fid);
+		break;
+	case TWSTAT:
+		buf_put_int32(bufp, fcall->params.twstat.fid);
+		stat = fcall->params.twstat.stat;
+
+		buf_put_int16(bufp, stat->size + 2);
+		serialize_stat(v9ses, stat, bufp);
+		break;
+	}
+
+	if (buf_check_overflow(bufp))
+		return -EIO;
+
+	return fcall->size;
+}
+
+/**
+ * deserialize_fcall - unmarshal a response
+ * @v9ses: session information
+ * @msgsize: size of rcall message
+ * @buf: recieved buffer
+ * @buflen: length of received buffer
+ * @rcall: fcall structure to populate
+ * @rcalllen: length of fcall structure to populate
+ *
+ */
+
+int
+v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
+		       void *buf, u32 buflen, struct v9fs_fcall *rcall,
+		       int rcalllen)
+{
+
+	struct cbuf buffer;
+	struct cbuf *bufp = &buffer;
+	struct cbuf dbuffer;
+	struct cbuf *dbufp = &dbuffer;
+	int i = 0;
+
+	buf_init(bufp, buf, buflen);
+	buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall),
+		 rcalllen - sizeof(struct v9fs_fcall));
+
+	rcall->size = msgsize;
+	rcall->id = buf_get_int8(bufp);
+	rcall->tag = buf_get_int16(bufp);
+
+	dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id,
+		rcall->tag);
+	switch (rcall->id) {
+	default:
+		eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id);
+		return -EPROTO;
+	case RVERSION:
+		rcall->params.rversion.msize = buf_get_int32(bufp);
+		rcall->params.rversion.version = buf_get_stringb(bufp, dbufp);
+		break;
+	case RFLUSH:
+		break;
+	case RATTACH:
+		rcall->params.rattach.qid.type = buf_get_int8(bufp);
+		rcall->params.rattach.qid.version = buf_get_int32(bufp);
+		rcall->params.rattach.qid.path = buf_get_int64(bufp);
+		break;
+	case RWALK:
+		rcall->params.rwalk.nwqid = buf_get_int16(bufp);
+		rcall->params.rwalk.wqids = buf_alloc(bufp,
+		      rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid));
+		if (rcall->params.rwalk.wqids)
+			for (i = 0; i < rcall->params.rwalk.nwqid; i++) {
+				rcall->params.rwalk.wqids[i].type =
+				    buf_get_int8(bufp);
+				rcall->params.rwalk.wqids[i].version =
+				    buf_get_int16(bufp);
+				rcall->params.rwalk.wqids[i].path =
+				    buf_get_int64(bufp);
+			}
+		break;
+	case ROPEN:
+		rcall->params.ropen.qid.type = buf_get_int8(bufp);
+		rcall->params.ropen.qid.version = buf_get_int32(bufp);
+		rcall->params.ropen.qid.path = buf_get_int64(bufp);
+		rcall->params.ropen.iounit = buf_get_int32(bufp);
+		break;
+	case RCREATE:
+		rcall->params.rcreate.qid.type = buf_get_int8(bufp);
+		rcall->params.rcreate.qid.version = buf_get_int32(bufp);
+		rcall->params.rcreate.qid.path = buf_get_int64(bufp);
+		rcall->params.rcreate.iounit = buf_get_int32(bufp);
+		break;
+	case RREAD:
+		rcall->params.rread.count = buf_get_int32(bufp);
+		rcall->params.rread.data = buf_get_datab(bufp, dbufp,
+			rcall->params.rread.count);
+		break;
+	case RWRITE:
+		rcall->params.rwrite.count = buf_get_int32(bufp);
+		break;
+	case RCLUNK:
+		break;
+	case RREMOVE:
+		break;
+	case RSTAT:
+		buf_get_int16(bufp);
+		rcall->params.rstat.stat =
+		    deserialize_statb(v9ses, bufp, dbufp);
+		break;
+	case RWSTAT:
+		break;
+	case RERROR:
+		rcall->params.rerror.error = buf_get_stringb(bufp, dbufp);
+		if (v9ses->extended)
+			rcall->params.rerror.errno = buf_get_int16(bufp);
+		break;
+	}
+
+	if (buf_check_overflow(bufp) || buf_check_overflow(dbufp))
+		return -EIO;
+
+	return rcall->size;
+}
diff --git a/fs/9p/conv.h b/fs/9p/conv.h
new file mode 100644
index 00000000000..ee849613c61
--- /dev/null
+++ b/fs/9p/conv.h
@@ -0,0 +1,36 @@
+/*
+ * linux/fs/9p/conv.h
+ *
+ * 9P protocol conversion definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf,
+			  u32 buflen, struct v9fs_stat *stat, u32 statlen);
+int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall,
+			 void *buf, u32 buflen);
+int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen,
+			   void *buf, u32 buflen, struct v9fs_fcall *rcall,
+			   int rcalllen);
+
+/* this one is actually in error.c right now */
+int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/debug.h b/fs/9p/debug.h
new file mode 100644
index 00000000000..4445f06919d
--- /dev/null
+++ b/fs/9p/debug.h
@@ -0,0 +1,70 @@
+/*
+ *  linux/fs/9p/debug.h - V9FS Debug Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#define DEBUG_ERROR		(1<<0)
+#define DEBUG_CURRENT		(1<<1)
+#define DEBUG_9P	        (1<<2)
+#define DEBUG_VFS	        (1<<3)
+#define DEBUG_CONV		(1<<4)
+#define DEBUG_MUX		(1<<5)
+#define DEBUG_TRANS		(1<<6)
+#define DEBUG_SLABS	      	(1<<7)
+
+#define DEBUG_DUMP_PKT		0
+
+extern int v9fs_debug_level;
+
+#define dprintk(level, format, arg...) \
+do {  \
+	if((v9fs_debug_level & level)==level) \
+		printk(KERN_NOTICE "-- %s (%d): " \
+		format , __FUNCTION__, current->pid , ## arg); \
+} while(0)
+
+#define eprintk(level, format, arg...) \
+do { \
+	printk(level "v9fs: %s (%d): " \
+		format , __FUNCTION__, current->pid , ## arg); \
+} while(0)
+
+#if DEBUG_DUMP_PKT
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+	int i, j;
+	int len = datalen;
+
+	printk(KERN_DEBUG "data ");
+	for (i = 0; i < len; i += 4) {
+		for (j = 0; (j < 4) && (i + j < len); j++)
+			printk(KERN_DEBUG "%02x", data[i + j]);
+		printk(KERN_DEBUG " ");
+	}
+	printk(KERN_DEBUG "\n");
+}
+#else				/* DEBUG_DUMP_PKT */
+static inline void dump_data(const unsigned char *data, unsigned int datalen)
+{
+
+}
+#endif				/* DEBUG_DUMP_PKT */
diff --git a/fs/9p/error.c b/fs/9p/error.c
new file mode 100644
index 00000000000..fee5d19179c
--- /dev/null
+++ b/fs/9p/error.c
@@ -0,0 +1,93 @@
+/*
+ * linux/fs/9p/error.c
+ *
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+#include "debug.h"
+#include "error.h"
+
+/**
+ * v9fs_error_init - preload
+ * @errstr: error string
+ *
+ */
+
+int v9fs_error_init(void)
+{
+	struct errormap *c;
+	int bucket;
+
+	/* initialize hash table */
+	for (bucket = 0; bucket < ERRHASHSZ; bucket++)
+		INIT_HLIST_HEAD(&hash_errmap[bucket]);
+
+	/* load initial error map into hash table */
+	for (c = errmap; c->name != NULL; c++) {
+		bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ;
+		INIT_HLIST_NODE(&c->list);
+		hlist_add_head(&c->list, &hash_errmap[bucket]);
+	}
+
+	return 1;
+}
+
+/**
+ * errstr2errno - convert error string to error number
+ * @errstr: error string
+ *
+ */
+
+int v9fs_errstr2errno(char *errstr)
+{
+	int errno = 0;
+	struct hlist_node *p = NULL;
+	struct errormap *c = NULL;
+	int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ;
+
+	hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
+		if (!strcmp(c->name, errstr)) {
+			errno = c->val;
+			break;
+		}
+	}
+
+	if (errno == 0) {
+		/* TODO: if error isn't found, add it dynamically */
+		printk(KERN_ERR "%s: errstr :%s: not found\n", __FUNCTION__,
+		       errstr);
+		errno = 1;
+	}
+
+	return -errno;
+}
diff --git a/fs/9p/error.h b/fs/9p/error.h
new file mode 100644
index 00000000000..78f89acf7c9
--- /dev/null
+++ b/fs/9p/error.h
@@ -0,0 +1,178 @@
+/*
+ * linux/fs/9p/error.h
+ *
+ * Huge Nasty Error Table
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  This table tries to
+ * match UNIX strings and Plan 9 strings to unix error numbers.  It is used
+ * to preload the dynamic error table which can also track user-specific error
+ * strings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/errno.h>
+#include <asm/errno.h>
+
+struct errormap {
+	char *name;
+	int val;
+
+	struct hlist_node list;
+};
+
+#define ERRHASHSZ		32
+static struct hlist_head hash_errmap[ERRHASHSZ];
+
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+	{"Operation not permitted", EPERM},
+	{"wstat prohibited", EPERM},
+	{"No such file or directory", ENOENT},
+	{"directory entry not found", ENOENT},
+	{"file not found", ENOENT},
+	{"Interrupted system call", EINTR},
+	{"Input/output error", EIO},
+	{"No such device or address", ENXIO},
+	{"Argument list too long", E2BIG},
+	{"Bad file descriptor", EBADF},
+	{"Resource temporarily unavailable", EAGAIN},
+	{"Cannot allocate memory", ENOMEM},
+	{"Permission denied", EACCES},
+	{"Bad address", EFAULT},
+	{"Block device required", ENOTBLK},
+	{"Device or resource busy", EBUSY},
+	{"File exists", EEXIST},
+	{"Invalid cross-device link", EXDEV},
+	{"No such device", ENODEV},
+	{"Not a directory", ENOTDIR},
+	{"Is a directory", EISDIR},
+	{"Invalid argument", EINVAL},
+	{"Too many open files in system", ENFILE},
+	{"Too many open files", EMFILE},
+	{"Text file busy", ETXTBSY},
+	{"File too large", EFBIG},
+	{"No space left on device", ENOSPC},
+	{"Illegal seek", ESPIPE},
+	{"Read-only file system", EROFS},
+	{"Too many links", EMLINK},
+	{"Broken pipe", EPIPE},
+	{"Numerical argument out of domain", EDOM},
+	{"Numerical result out of range", ERANGE},
+	{"Resource deadlock avoided", EDEADLK},
+	{"File name too long", ENAMETOOLONG},
+	{"No locks available", ENOLCK},
+	{"Function not implemented", ENOSYS},
+	{"Directory not empty", ENOTEMPTY},
+	{"Too many levels of symbolic links", ELOOP},
+	{"No message of desired type", ENOMSG},
+	{"Identifier removed", EIDRM},
+	{"No data available", ENODATA},
+	{"Machine is not on the network", ENONET},
+	{"Package not installed", ENOPKG},
+	{"Object is remote", EREMOTE},
+	{"Link has been severed", ENOLINK},
+	{"Communication error on send", ECOMM},
+	{"Protocol error", EPROTO},
+	{"Bad message", EBADMSG},
+	{"File descriptor in bad state", EBADFD},
+	{"Streams pipe error", ESTRPIPE},
+	{"Too many users", EUSERS},
+	{"Socket operation on non-socket", ENOTSOCK},
+	{"Message too long", EMSGSIZE},
+	{"Protocol not available", ENOPROTOOPT},
+	{"Protocol not supported", EPROTONOSUPPORT},
+	{"Socket type not supported", ESOCKTNOSUPPORT},
+	{"Operation not supported", EOPNOTSUPP},
+	{"Protocol family not supported", EPFNOSUPPORT},
+	{"Network is down", ENETDOWN},
+	{"Network is unreachable", ENETUNREACH},
+	{"Network dropped connection on reset", ENETRESET},
+	{"Software caused connection abort", ECONNABORTED},
+	{"Connection reset by peer", ECONNRESET},
+	{"No buffer space available", ENOBUFS},
+	{"Transport endpoint is already connected", EISCONN},
+	{"Transport endpoint is not connected", ENOTCONN},
+	{"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+	{"Connection timed out", ETIMEDOUT},
+	{"Connection refused", ECONNREFUSED},
+	{"Host is down", EHOSTDOWN},
+	{"No route to host", EHOSTUNREACH},
+	{"Operation already in progress", EALREADY},
+	{"Operation now in progress", EINPROGRESS},
+	{"Is a named type file", EISNAM},
+	{"Remote I/O error", EREMOTEIO},
+	{"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+	{"fid unknown or out of range", EBADF},
+	{"permission denied", EACCES},
+	{"file does not exist", ENOENT},
+	{"authentication failed", ECONNREFUSED},
+	{"bad offset in directory read", ESPIPE},
+	{"bad use of fid", EBADF},
+	{"wstat can't convert between files and directories", EPERM},
+	{"directory is not empty", ENOTEMPTY},
+	{"file exists", EEXIST},
+	{"file already exists", EEXIST},
+	{"file or directory already exists", EEXIST},
+	{"fid already in use", EBADF},
+	{"file in use", ETXTBSY},
+	{"i/o error", EIO},
+	{"file already open for I/O", ETXTBSY},
+	{"illegal mode", EINVAL},
+	{"illegal name", ENAMETOOLONG},
+	{"not a directory", ENOTDIR},
+	{"not a member of proposed group", EPERM},
+	{"not owner", EACCES},
+	{"only owner can change group in wstat", EACCES},
+	{"read only file system", EROFS},
+	{"no access to special file", EPERM},
+	{"i/o count too large", EIO},
+	{"unknown group", EINVAL},
+	{"unknown user", EINVAL},
+	{"bogus wstat buffer", EPROTO},
+	{"exclusive use file already open", EAGAIN},
+	{"corrupted directory entry", EIO},
+	{"corrupted file entry", EIO},
+	{"corrupted block label", EIO},
+	{"corrupted meta data", EIO},
+	{"illegal offset", EINVAL},
+	{"illegal path element", ENOENT},
+	{"root of file system is corrupted", EIO},
+	{"corrupted super block", EIO},
+	{"protocol botch", EPROTO},
+	{"file system is full", ENOSPC},
+	{"file is in use", EAGAIN},
+	{"directory entry is not allocated", ENOENT},
+	{"file is read only", EROFS},
+	{"file has been removed", EIDRM},
+	{"only support truncation to zero length", EPERM},
+	{"cannot remove root", EPERM},
+	{"file too big", EFBIG},
+	{"venti i/o error", EIO},
+	/* these are not errors */
+	{"u9fs rhostsauth: no authentication required", 0},
+	{"u9fs authnone: no authentication required", 0},
+	{NULL, -1}
+};
+
+extern int v9fs_error_init(void);
+extern int v9fs_errstr2errno(char *errstr);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
new file mode 100644
index 00000000000..821c9c4d76a
--- /dev/null
+++ b/fs/9p/fid.c
@@ -0,0 +1,241 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * v9fs_fid_insert - add a fid to a dentry
+ * @fid: fid to add
+ * @dentry: dentry that it is being added to
+ *
+ */
+
+static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
+{
+	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
+	dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
+		dentry->d_iname, dentry);
+	if (dentry->d_fsdata == NULL) {
+		dentry->d_fsdata =
+		    kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (dentry->d_fsdata == NULL) {
+			dprintk(DEBUG_ERROR, "Out of memory\n");
+			return -ENOMEM;
+		}
+		fid_list = (struct list_head *)dentry->d_fsdata;
+		INIT_LIST_HEAD(fid_list);	/* Initialize list head */
+	}
+
+	fid->uid = current->uid;
+	fid->pid = current->pid;
+	list_add(&fid->list, fid_list);
+	return 0;
+}
+
+/**
+ * v9fs_fid_create - allocate a FID structure
+ * @dentry - dentry to link newly created fid to
+ *
+ */
+
+struct v9fs_fid *v9fs_fid_create(struct dentry *dentry)
+{
+	struct v9fs_fid *new;
+
+	new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+	if (new == NULL) {
+		dprintk(DEBUG_ERROR, "Out of Memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	new->fid = -1;
+	new->fidopen = 0;
+	new->fidcreate = 0;
+	new->fidclunked = 0;
+	new->iounit = 0;
+
+	if (v9fs_fid_insert(new, dentry) == 0)
+		return new;
+	else {
+		dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
+		kfree(new);
+		return NULL;
+	}
+}
+
+/**
+ * v9fs_fid_destroy - deallocate a FID structure
+ * @fid: fid to destroy
+ *
+ */
+
+void v9fs_fid_destroy(struct v9fs_fid *fid)
+{
+	list_del(&fid->list);
+	kfree(fid);
+}
+
+/**
+ * v9fs_fid_lookup - retrieve the right fid from a  particular dentry
+ * @dentry: dentry to look for fid in
+ * @type: intent of lookup (operation or traversal)
+ *
+ * search list of fids associated with a dentry for a fid with a matching
+ * thread id or uid.  If that fails, look up the dentry's parents to see if you
+ * can find a matching fid.
+ *
+ */
+
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type)
+{
+	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
+	struct v9fs_fid *current_fid = NULL;
+	struct v9fs_fid *temp = NULL;
+	struct v9fs_fid *return_fid = NULL;
+	int found_parent = 0;
+	int found_user = 0;
+
+	dprintk(DEBUG_9P, " dentry: %s (%p) type %d\n", dentry->d_iname, dentry,
+		type);
+
+	if (fid_list && !list_empty(fid_list)) {
+		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+			if (current_fid->uid == current->uid) {
+				if (return_fid == NULL) {
+					if ((type == FID_OP)
+					    || (!current_fid->fidopen)) {
+						return_fid = current_fid;
+						found_user = 1;
+					}
+				}
+			}
+			if (current_fid->pid == current->real_parent->pid) {
+				if ((return_fid == NULL) || (found_parent)
+				    || (found_user)) {
+					if ((type == FID_OP)
+					    || (!current_fid->fidopen)) {
+						return_fid = current_fid;
+						found_parent = 1;
+						found_user = 0;
+					}
+				}
+			}
+			if (current_fid->pid == current->pid) {
+				if ((type == FID_OP) ||
+				    (!current_fid->fidopen)) {
+					return_fid = current_fid;
+					found_parent = 0;
+					found_user = 0;
+				}
+			}
+		}
+	}
+
+	/* we are at the root but didn't match */
+	if ((!return_fid) && (dentry->d_parent == dentry)) {
+		/* TODO: clone attach with new uid */
+		return_fid = current_fid;
+	}
+
+	if (!return_fid) {
+		struct dentry *par = current->fs->pwd->d_parent;
+		int count = 1;
+		while (par != NULL) {
+			if (par == dentry)
+				break;
+			count++;
+			if (par == par->d_parent) {
+				dprintk(DEBUG_ERROR,
+					"got to root without finding dentry\n");
+				break;
+			}
+			par = par->d_parent;
+		}
+
+/* XXX - there may be some duplication we can get rid of */
+		if (par == dentry) {
+			/* we need to fid_lookup the starting point */
+			int fidnum = -1;
+			int oldfid = -1;
+			int result = -1;
+			struct v9fs_session_info *v9ses =
+			    v9fs_inode2v9ses(current->fs->pwd->d_inode);
+
+			current_fid =
+			    v9fs_fid_lookup(current->fs->pwd, FID_WALK);
+			if (current_fid == NULL) {
+				dprintk(DEBUG_ERROR,
+					"process cwd doesn't have a fid\n");
+				return return_fid;
+			}
+			oldfid = current_fid->fid;
+			par = current->fs->pwd;
+			/* TODO: take advantage of multiwalk */
+
+			fidnum = v9fs_get_idpool(&v9ses->fidpool);
+			if (fidnum < 0) {
+				dprintk(DEBUG_ERROR,
+					"could not get a new fid num\n");
+				return return_fid;
+			}
+
+			while (par != dentry) {
+				result =
+				    v9fs_t_walk(v9ses, oldfid, fidnum, "..",
+						NULL);
+				if (result < 0) {
+					dprintk(DEBUG_ERROR,
+						"problem walking to parent\n");
+
+					break;
+				}
+				oldfid = fidnum;
+				if (par == par->d_parent) {
+					dprintk(DEBUG_ERROR,
+						"can't find dentry\n");
+					break;
+				}
+				par = par->d_parent;
+			}
+			if (par == dentry) {
+				return_fid = v9fs_fid_create(dentry);
+				return_fid->fid = fidnum;
+			}
+		}
+	}
+
+	return return_fid;
+}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
new file mode 100644
index 00000000000..7db478ccca3
--- /dev/null
+++ b/fs/9p/fid.h
@@ -0,0 +1,57 @@
+/*
+ * V9FS FID Management
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/list.h>
+
+#define FID_OP   0
+#define FID_WALK 1
+
+struct v9fs_fid {
+	struct list_head list;	 /* list of fids associated with a dentry */
+	struct list_head active; /* XXX - debug */
+
+	u32 fid;
+	unsigned char fidopen;	  /* set when fid is opened */
+	unsigned char fidcreate;  /* set when fid was just created */
+	unsigned char fidclunked; /* set when fid has already been clunked */
+
+	struct v9fs_qid qid;
+	u32 iounit;
+
+	/* readdir stuff */
+	int rdir_fpos;
+	loff_t rdir_pos;
+	struct v9fs_fcall *rdir_fcall;
+
+	/* management stuff */
+	pid_t pid;		/* thread associated with this fid */
+	uid_t uid;		/* user associated with this fid */
+
+	/* private data */
+	struct file *filp;	/* backpointer to File struct for open files */
+	struct v9fs_session_info *v9ses;	/* session info for this FID */
+};
+
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type);
+void v9fs_fid_destroy(struct v9fs_fid *fid);
+struct v9fs_fid *v9fs_fid_create(struct dentry *);
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
new file mode 100644
index 00000000000..8835b576f74
--- /dev/null
+++ b/fs/9p/mux.c
@@ -0,0 +1,475 @@
+/*
+ * linux/fs/9p/mux.c
+ *
+ * Protocol Multiplexer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kthread.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "transport.h"
+#include "conv.h"
+#include "mux.h"
+
+/**
+ * dprintcond - print condition of session info
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ */
+
+static inline int
+dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status,
+		req->rcall);
+	return 0;
+}
+
+/**
+ * xread - force read of a certain number of bytes
+ * @v9ses: session info structure
+ * @ptr: pointer to buffer
+ * @sz: number of bytes to read
+ *
+ * Chuck Cranor CS-533 project1
+ */
+
+static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz)
+{
+	int rd = 0;
+	int ret = 0;
+	while (rd < sz) {
+		ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd);
+		if (ret <= 0) {
+			dprintk(DEBUG_ERROR, "xread errno %d\n", ret);
+			return ret;
+		}
+		rd += ret;
+		ptr += ret;
+	}
+	return (rd);
+}
+
+/**
+ * read_message - read a full 9P2000 fcall packet
+ * @v9ses: session info structure
+ * @rcall: fcall structure to read into
+ * @rcalllen: size of fcall buffer
+ *
+ */
+
+static int
+read_message(struct v9fs_session_info *v9ses,
+	     struct v9fs_fcall *rcall, int rcalllen)
+{
+	unsigned char buf[4];
+	void *data;
+	int size = 0;
+	int res = 0;
+
+	res = xread(v9ses, buf, sizeof(buf));
+	if (res < 0) {
+		dprintk(DEBUG_ERROR,
+			"Reading of count field failed returned: %d\n", res);
+		return res;
+	}
+
+	if (res < 4) {
+		dprintk(DEBUG_ERROR,
+			"Reading of count field failed returned: %d\n", res);
+		return -EIO;
+	}
+
+	size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+	dprintk(DEBUG_MUX, "got a packet count: %d\n", size);
+
+	/* adjust for the four bytes of size */
+	size -= 4;
+
+	if (size > v9ses->maxdata) {
+		dprintk(DEBUG_ERROR, "packet too big: %d\n", size);
+		return -E2BIG;
+	}
+
+	data = kmalloc(size, GFP_KERNEL);
+	if (!data) {
+		eprintk(KERN_WARNING, "out of memory\n");
+		return -ENOMEM;
+	}
+
+	res = xread(v9ses, data, size);
+	if (res < size) {
+		dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n",
+			res);
+		kfree(data);
+		return res;
+	}
+
+	/* we now have an in-memory string that is the reply.
+	 * deserialize it. There is very little to go wrong at this point
+	 * save for v9fs_alloc errors.
+	 */
+	res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata,
+				     rcall, rcalllen);
+
+	kfree(data);
+
+	if (res < 0)
+		return res;
+
+	return 0;
+}
+
+/**
+ * v9fs_recv - receive an RPC response for a particular tag
+ * @v9ses: session info structure
+ * @req: RPC request structure
+ *
+ */
+
+static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	int ret = 0;
+
+	dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag);
+	ret = wait_event_interruptible(v9ses->read_wait,
+		       ((v9ses->transport->status != Connected) ||
+			(req->rcall != 0) || (req->err < 0) ||
+			dprintcond(v9ses, req)));
+
+	dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall);
+
+	spin_lock(&v9ses->muxlock);
+	list_del(&req->next);
+	spin_unlock(&v9ses->muxlock);
+
+	if (req->err < 0)
+		return req->err;
+
+	if (v9ses->transport->status == Disconnected)
+		return -ECONNRESET;
+
+	return ret;
+}
+
+/**
+ * v9fs_send - send a 9P request
+ * @v9ses: session info structure
+ * @req: RPC request to send
+ *
+ */
+
+static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req)
+{
+	int ret = -1;
+	void *data = NULL;
+	struct v9fs_fcall *tcall = req->tcall;
+
+	data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	tcall->size = 0;	/* enforce size recalculation */
+	ret =
+	    v9fs_serialize_fcall(v9ses, tcall, data,
+				 v9ses->maxdata + V9FS_IOHDRSZ);
+	if (ret < 0)
+		goto free_data;
+
+	spin_lock(&v9ses->muxlock);
+	list_add(&req->next, &v9ses->mux_fcalls);
+	spin_unlock(&v9ses->muxlock);
+
+	dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag,
+		tcall->size);
+	ret = v9ses->transport->write(v9ses->transport, data, tcall->size);
+
+	if (ret != tcall->size) {
+		spin_lock(&v9ses->muxlock);
+		list_del(&req->next);
+		kfree(req->rcall);
+
+		spin_unlock(&v9ses->muxlock);
+		if (ret >= 0)
+			ret = -EREMOTEIO;
+	} else
+		ret = 0;
+
+      free_data:
+	kfree(data);
+	return ret;
+}
+
+/**
+ * v9fs_mux_rpc - send a request, receive a response
+ * @v9ses: session info structure
+ * @tcall: fcall to send
+ * @rcall: buffer to place response into
+ *
+ */
+
+long
+v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall,
+	     struct v9fs_fcall **rcall)
+{
+	int tid = -1;
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_rpcreq req;
+	int ret = -1;
+
+	if (!v9ses)
+		return -EINVAL;
+
+	if (!v9ses->transport || v9ses->transport->status != Connected)
+		return -EIO;
+
+	if (rcall)
+		*rcall = NULL;
+
+	if (tcall->id != TVERSION) {
+		tid = v9fs_get_idpool(&v9ses->tidpool);
+		if (tid < 0)
+			return -ENOMEM;
+	}
+
+	tcall->tag = tid;
+
+	req.tcall = tcall;
+	req.err = 0;
+	req.rcall = NULL;
+
+	ret = v9fs_send(v9ses, &req);
+
+	if (ret < 0) {
+		if (tcall->id != TVERSION)
+			v9fs_put_idpool(tid, &v9ses->tidpool);
+		dprintk(DEBUG_MUX, "error %d\n", ret);
+		return ret;
+	}
+
+	ret = v9fs_recv(v9ses, &req);
+
+	fcall = req.rcall;
+
+	dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret);
+	if (ret == -ERESTARTSYS) {
+		if (v9ses->transport->status != Disconnected
+		    && tcall->id != TFLUSH) {
+			unsigned long flags;
+
+			dprintk(DEBUG_MUX, "flushing the tag: %d\n",
+				tcall->tag);
+			clear_thread_flag(TIF_SIGPENDING);
+			v9fs_t_flush(v9ses, tcall->tag);
+			spin_lock_irqsave(&current->sighand->siglock, flags);
+			recalc_sigpending();
+			spin_unlock_irqrestore(&current->sighand->siglock,
+					       flags);
+			dprintk(DEBUG_MUX, "flushing done\n");
+		}
+
+		goto release_req;
+	} else if (ret < 0)
+		goto release_req;
+
+	if (!fcall)
+		ret = -EIO;
+	else {
+		if (fcall->id == RERROR) {
+			ret = v9fs_errstr2errno(fcall->params.rerror.error);
+			if (ret == 0) {	/* string match failed */
+				if (fcall->params.rerror.errno)
+					ret = -(fcall->params.rerror.errno);
+				else
+					ret = -ESERVERFAULT;
+			}
+		} else if (fcall->id != tcall->id + 1) {
+			dprintk(DEBUG_ERROR,
+				"fcall mismatch: expected %d, got %d\n",
+				tcall->id + 1, fcall->id);
+			ret = -EIO;
+		}
+	}
+
+      release_req:
+	if (tcall->id != TVERSION)
+		v9fs_put_idpool(tid, &v9ses->tidpool);
+	if (rcall)
+		*rcall = fcall;
+	else
+		kfree(fcall);
+
+	return ret;
+}
+
+/**
+ * v9fs_mux_cancel_requests - cancels all pending requests
+ *
+ * @v9ses: session info structure
+ * @err: error code to return to the requests
+ */
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err)
+{
+	struct v9fs_rpcreq *rptr;
+	struct v9fs_rpcreq *rreq;
+
+	dprintk(DEBUG_MUX, " %d\n", err);
+	spin_lock(&v9ses->muxlock);
+	list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+		rreq->err = err;
+	}
+	spin_unlock(&v9ses->muxlock);
+	wake_up_all(&v9ses->read_wait);
+}
+
+/**
+ * v9fs_recvproc - kproc to handle demultiplexing responses
+ * @data: session info structure
+ *
+ */
+
+static int v9fs_recvproc(void *data)
+{
+	struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data;
+	struct v9fs_fcall *rcall = NULL;
+	struct v9fs_rpcreq *rptr;
+	struct v9fs_rpcreq *req;
+	struct v9fs_rpcreq *rreq;
+	int err = 0;
+
+	allow_signal(SIGKILL);
+	set_current_state(TASK_INTERRUPTIBLE);
+	complete(&v9ses->proccmpl);
+	while (!kthread_should_stop() && err >= 0) {
+		req = rptr = rreq = NULL;
+
+		rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL);
+		if (!rcall) {
+			eprintk(KERN_ERR, "no memory for buffers\n");
+			break;
+		}
+
+		err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ);
+		spin_lock(&v9ses->muxlock);
+		if (err < 0) {
+			list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+				rreq->err = err;
+			}
+			if(err != -ERESTARTSYS)
+				eprintk(KERN_ERR,
+					"Transport error while reading message %d\n", err);
+		} else {
+			list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) {
+				if (rreq->tcall->tag == rcall->tag) {
+					req = rreq;
+					req->rcall = rcall;
+					break;
+				}
+			}
+		}
+
+		if (req && (req->tcall->id == TFLUSH)) {
+			struct v9fs_rpcreq *treq = NULL;
+			list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) {
+				if (treq->tcall->tag ==
+				    req->tcall->params.tflush.oldtag) {
+					list_del(&rptr->next);
+					kfree(treq->rcall);
+					break;
+				}
+			}
+		}
+
+		spin_unlock(&v9ses->muxlock);
+
+		if (!req) {
+			if (err >= 0)
+				dprintk(DEBUG_ERROR,
+					"unexpected response: id %d tag %d\n",
+					rcall->id, rcall->tag);
+
+			kfree(rcall);
+		}
+
+		wake_up_all(&v9ses->read_wait);
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	v9ses->transport->close(v9ses->transport);
+
+	/* Inform all pending processes about the failure */
+	wake_up_all(&v9ses->read_wait);
+
+	if (signal_pending(current))
+		complete(&v9ses->proccmpl);
+
+	dprintk(DEBUG_MUX, "recvproc: end\n");
+	v9ses->recvproc = NULL;
+
+	return err >= 0;
+}
+
+/**
+ * v9fs_mux_init - initialize multiplexer (spawn kproc)
+ * @v9ses: session info structure
+ * @dev_name: mount device information (to create unique kproc)
+ *
+ */
+
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name)
+{
+	char procname[60];
+
+	strncpy(procname, dev_name, sizeof(procname));
+	procname[sizeof(procname) - 1] = 0;
+
+	init_waitqueue_head(&v9ses->read_wait);
+	init_completion(&v9ses->fcread);
+	init_completion(&v9ses->proccmpl);
+	spin_lock_init(&v9ses->muxlock);
+	INIT_LIST_HEAD(&v9ses->mux_fcalls);
+	v9ses->recvproc = NULL;
+	v9ses->curfcall = NULL;
+
+	v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses,
+					 "v9fs_recvproc %s", procname);
+
+	if (IS_ERR(v9ses->recvproc)) {
+		eprintk(KERN_ERR, "cannot create receiving thread\n");
+		v9fs_session_close(v9ses);
+		return -ECONNABORTED;
+	}
+
+	wake_up_process(v9ses->recvproc);
+	wait_for_completion(&v9ses->proccmpl);
+
+	return 0;
+}
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
new file mode 100644
index 00000000000..4994cb10bad
--- /dev/null
+++ b/fs/9p/mux.h
@@ -0,0 +1,41 @@
+/*
+ * linux/fs/9p/mux.h
+ *
+ * Multiplexer Definitions
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* structure to manage each RPC transaction */
+
+struct v9fs_rpcreq {
+	struct v9fs_fcall *tcall;
+	struct v9fs_fcall *rcall;
+	int err;	/* error code if response failed */
+
+	/* XXX - could we put scatter/gather buffers here? */
+
+	struct list_head next;
+};
+
+int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name);
+long v9fs_mux_rpc(struct v9fs_session_info *v9ses,
+		  struct v9fs_fcall *tcall, struct v9fs_fcall **rcall);
+void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
new file mode 100644
index 00000000000..63b58ce98ff
--- /dev/null
+++ b/fs/9p/trans_fd.c
@@ -0,0 +1,172 @@
+/*
+ * linux/fs/9p/trans_fd.c
+ *
+ * File Descriptor Transport Layer
+ *
+ *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+
+struct v9fs_trans_fd {
+	struct file *in_file;
+	struct file *out_file;
+};
+
+/**
+ * v9fs_fd_recv - receive from a socket
+ * @v9ses: session information
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int v9fs_fd_recv(struct v9fs_transport *trans, void *v, int len)
+{
+	struct v9fs_trans_fd *ts = trans ? trans->priv : NULL;
+
+	if (!trans || trans->status != Connected || !ts)
+		return -EIO;
+
+	return kernel_read(ts->in_file, ts->in_file->f_pos, v, len);
+}
+
+/**
+ * v9fs_fd_send - send to a socket
+ * @v9ses: session information
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
+{
+	struct v9fs_trans_fd *ts = trans ? trans->priv : NULL;
+	mm_segment_t oldfs = get_fs();
+	int ret = 0;
+
+	if (!trans || trans->status != Connected || !ts)
+		return -EIO;
+
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+/**
+ * v9fs_fd_init - initialize file descriptor transport
+ * @v9ses: session information
+ * @addr: address of server to mount
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+	struct v9fs_trans_fd *ts = NULL;
+	struct v9fs_transport *trans = v9ses->transport;
+
+	if((v9ses->wfdno == ~0) || (v9ses->rfdno == ~0)) {
+		printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
+		return -ENOPROTOOPT;
+	}
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL);
+
+	if (!ts)
+		return -ENOMEM;
+
+	ts->in_file = fget( v9ses->rfdno );
+	ts->out_file = fget( v9ses->wfdno );
+
+	if (!ts->in_file || !ts->out_file) {
+		if (ts->in_file)
+			fput(ts->in_file);
+
+		if (ts->out_file)
+			fput(ts->out_file);
+
+		kfree(ts);
+		return -EIO;
+	}
+
+	trans->priv = ts;
+	trans->status = Connected;
+
+	return 0;
+}
+
+
+/**
+ * v9fs_fd_close - shutdown file descriptor
+ * @trans: private socket structure
+ *
+ */
+
+static void v9fs_fd_close(struct v9fs_transport *trans)
+{
+	struct v9fs_trans_fd *ts;
+
+	if (!trans)
+		return;
+
+	trans->status = Disconnected;
+	ts = trans->priv;
+
+	if (!ts)
+		return;
+
+	if (ts->in_file)
+		fput(ts->in_file);
+
+	if (ts->out_file)
+		fput(ts->out_file);
+
+	kfree(ts);
+}
+
+struct v9fs_transport v9fs_trans_fd = {
+	.init = v9fs_fd_init,
+	.write = v9fs_fd_send,
+	.read = v9fs_fd_recv,
+	.close = v9fs_fd_close,
+};
+
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
new file mode 100644
index 00000000000..01e26f0013a
--- /dev/null
+++ b/fs/9p/trans_sock.c
@@ -0,0 +1,290 @@
+/*
+ * linux/fs/9p/trans_socket.c
+ *
+ * Socket Transport Layer
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *  Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <asm/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "transport.h"
+
+#define V9FS_PORT 564
+
+struct v9fs_trans_sock {
+	struct socket *s;
+};
+
+/**
+ * v9fs_sock_recv - receive from a socket
+ * @v9ses: session information
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len)
+{
+	struct msghdr msg;
+	struct kvec iov;
+	int result;
+	mm_segment_t oldfs;
+	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+
+	if (trans->status == Disconnected)
+		return -EREMOTEIO;
+
+	result = -EINVAL;
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+
+	iov.iov_base = v;
+	iov.iov_len = len;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = MSG_NOSIGNAL;
+
+	result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0);
+
+	dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state);
+	set_fs(oldfs);
+
+	if (result <= 0) {
+		if (result != -ERESTARTSYS)
+			trans->status = Disconnected;
+	}
+
+	return result;
+}
+
+/**
+ * v9fs_sock_send - send to a socket
+ * @v9ses: session information
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len)
+{
+	struct kvec iov;
+	struct msghdr msg;
+	int result = -1;
+	mm_segment_t oldfs;
+	struct v9fs_trans_sock *ts = trans ? trans->priv : NULL;
+
+	dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len);
+	dump_data(v, len);
+
+	down(&trans->writelock);
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	iov.iov_base = v;
+	iov.iov_len = len;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_flags = MSG_NOSIGNAL;
+	result = kernel_sendmsg(ts->s, &msg, &iov, 1, len);
+	set_fs(oldfs);
+
+	if (result < 0) {
+		if (result != -ERESTARTSYS)
+			trans->status = Disconnected;
+	}
+
+	up(&trans->writelock);
+	return result;
+}
+
+/**
+ * v9fs_tcp_init - initialize TCP socket
+ * @v9ses: session information
+ * @addr: address of server to mount
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data)
+{
+	struct socket *csocket = NULL;
+	struct sockaddr_in sin_server;
+	int rc = 0;
+	struct v9fs_trans_sock *ts = NULL;
+	struct v9fs_transport *trans = v9ses->transport;
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+
+	if (!ts)
+		return -ENOMEM;
+
+	trans->priv = ts;
+	ts->s = NULL;
+
+	if (!addr)
+		return -EINVAL;
+
+	dprintk(DEBUG_TRANS, "Connecting to %s\n", addr);
+
+	sin_server.sin_family = AF_INET;
+	sin_server.sin_addr.s_addr = in_aton(addr);
+	sin_server.sin_port = htons(v9ses->port);
+	sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket);
+	rc = csocket->ops->connect(csocket,
+				   (struct sockaddr *)&sin_server,
+				   sizeof(struct sockaddr_in), 0);
+	if (rc < 0) {
+		eprintk(KERN_ERR,
+			"v9fs_trans_tcp: problem connecting socket to %s\n",
+			addr);
+		return rc;
+	}
+	csocket->sk->sk_allocation = GFP_NOIO;
+	ts->s = csocket;
+	trans->status = Connected;
+
+	return 0;
+}
+
+/**
+ * v9fs_unix_init - initialize UNIX domain socket
+ * @v9ses: session information
+ * @dev_name: path to named pipe
+ * @data: mount options
+ *
+ */
+
+static int
+v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name,
+	       char *data)
+{
+	int rc;
+	struct socket *csocket;
+	struct sockaddr_un sun_server;
+	struct v9fs_transport *trans;
+	struct v9fs_trans_sock *ts;
+
+	rc = 0;
+	csocket = NULL;
+	trans = v9ses->transport;
+
+	if (strlen(dev_name) > UNIX_PATH_MAX) {
+		eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n",
+			dev_name);
+		return -ENOMEM;
+	}
+
+	ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL);
+	if (!ts)
+		return -ENOMEM;
+
+	trans->priv = ts;
+	ts->s = NULL;
+
+	sema_init(&trans->writelock, 1);
+	sema_init(&trans->readlock, 1);
+
+	sun_server.sun_family = PF_UNIX;
+	strcpy(sun_server.sun_path, dev_name);
+	sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket);
+	rc = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
+		sizeof(struct sockaddr_un) - 1, 0);	/* -1 *is* important */
+	if (rc < 0) {
+		eprintk(KERN_ERR,
+			"v9fs_trans_unix: problem connecting socket: %s: %d\n",
+			dev_name, rc);
+		return rc;
+	}
+	csocket->sk->sk_allocation = GFP_NOIO;
+	ts->s = csocket;
+	trans->status = Connected;
+
+	return 0;
+}
+
+/**
+ * v9fs_sock_close - shutdown socket
+ * @trans: private socket structure
+ *
+ */
+
+static void v9fs_sock_close(struct v9fs_transport *trans)
+{
+	struct v9fs_trans_sock *ts;
+
+	if (!trans)
+		return;
+
+	ts = trans->priv;
+
+	if ((ts) && (ts->s)) {
+		dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s);
+		sock_release(ts->s);
+		ts->s = NULL;
+		trans->status = Disconnected;
+		dprintk(DEBUG_TRANS, "socket closed\n");
+	}
+
+	if (ts)
+		kfree(ts);
+
+	trans->priv = NULL;
+}
+
+struct v9fs_transport v9fs_trans_tcp = {
+	.init = v9fs_tcp_init,
+	.write = v9fs_sock_send,
+	.read = v9fs_sock_recv,
+	.close = v9fs_sock_close,
+};
+
+struct v9fs_transport v9fs_trans_unix = {
+	.init = v9fs_unix_init,
+	.write = v9fs_sock_send,
+	.read = v9fs_sock_recv,
+	.close = v9fs_sock_close,
+};
diff --git a/fs/9p/transport.h b/fs/9p/transport.h
new file mode 100644
index 00000000000..9e9cd418efd
--- /dev/null
+++ b/fs/9p/transport.h
@@ -0,0 +1,46 @@
+/*
+ * linux/fs/9p/transport.h
+ *
+ * Transport Definition
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+enum v9fs_transport_status {
+	Connected,
+	Disconnected,
+	Hung,
+};
+
+struct v9fs_transport {
+	enum v9fs_transport_status status;
+	struct semaphore writelock;
+	struct semaphore readlock;
+	void *priv;
+
+	int (*init) (struct v9fs_session_info *, const char *, char *);
+	int (*write) (struct v9fs_transport *, void *, int);
+	int (*read) (struct v9fs_transport *, void *, int);
+	void (*close) (struct v9fs_transport *);
+};
+
+extern struct v9fs_transport v9fs_trans_tcp;
+extern struct v9fs_transport v9fs_trans_unix;
+extern struct v9fs_transport v9fs_trans_fd;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
new file mode 100644
index 00000000000..13bdbbab438
--- /dev/null
+++ b/fs/9p/v9fs.c
@@ -0,0 +1,452 @@
+/*
+ *  linux/fs/9p/v9fs.c
+ *
+ *  This file contains functions assisting in mapping VFS to 9P2000
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "transport.h"
+#include "mux.h"
+#include "conv.h"
+
+/* TODO: sysfs or debugfs interface */
+int v9fs_debug_level = 0;	/* feature-rific global debug level  */
+
+/*
+  * Option Parsing (code inspired by NFS code)
+  *
+  */
+
+enum {
+	/* Options that take integer arguments */
+	Opt_port, Opt_msize, Opt_uid, Opt_gid, Opt_afid, Opt_debug,
+	Opt_rfdno, Opt_wfdno,
+	/* String options */
+	Opt_name, Opt_remotename,
+	/* Options that take no arguments */
+	Opt_legacy, Opt_nodevmap, Opt_unix, Opt_tcp, Opt_fd,
+	/* Error token */
+	Opt_err
+};
+
+static match_table_t tokens = {
+	{Opt_port, "port=%u"},
+	{Opt_msize, "msize=%u"},
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_afid, "afid=%u"},
+	{Opt_rfdno, "rfdno=%u"},
+	{Opt_wfdno, "wfdno=%u"},
+	{Opt_debug, "debug=%u"},
+	{Opt_name, "name=%s"},
+	{Opt_remotename, "aname=%s"},
+	{Opt_unix, "proto=unix"},
+	{Opt_tcp, "proto=tcp"},
+	{Opt_fd, "proto=fd"},
+	{Opt_tcp, "tcp"},
+	{Opt_unix, "unix"},
+	{Opt_fd, "fd"},
+	{Opt_legacy, "noextend"},
+	{Opt_nodevmap, "nodevmap"},
+	{Opt_err, NULL}
+};
+
+/*
+ *  Parse option string.
+ */
+
+/**
+ * v9fs_parse_options - parse mount options into session structure
+ * @options: options string passed from mount
+ * @v9ses: existing v9fs session information
+ *
+ */
+
+static void v9fs_parse_options(char *options, struct v9fs_session_info *v9ses)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int ret;
+
+	/* setup defaults */
+	v9ses->port = V9FS_PORT;
+	v9ses->maxdata = 9000;
+	v9ses->proto = PROTO_TCP;
+	v9ses->extended = 1;
+	v9ses->afid = ~0;
+	v9ses->debug = 0;
+	v9ses->rfdno = ~0;
+	v9ses->wfdno = ~0;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token < Opt_name) {
+			if ((ret = match_int(&args[0], &option)) < 0) {
+				dprintk(DEBUG_ERROR,
+					"integer field, but no integer?\n");
+				continue;
+			}
+
+		}
+		switch (token) {
+		case Opt_port:
+			v9ses->port = option;
+			break;
+		case Opt_msize:
+			v9ses->maxdata = option;
+			break;
+		case Opt_uid:
+			v9ses->uid = option;
+			break;
+		case Opt_gid:
+			v9ses->gid = option;
+			break;
+		case Opt_afid:
+			v9ses->afid = option;
+			break;
+		case Opt_rfdno:
+			v9ses->rfdno = option;
+			break;
+		case Opt_wfdno:
+			v9ses->wfdno = option;
+			break;
+		case Opt_debug:
+			v9ses->debug = option;
+			break;
+		case Opt_tcp:
+			v9ses->proto = PROTO_TCP;
+			break;
+		case Opt_unix:
+			v9ses->proto = PROTO_UNIX;
+			break;
+		case Opt_fd:
+			v9ses->proto = PROTO_FD;
+			break;
+		case Opt_name:
+			match_strcpy(v9ses->name, &args[0]);
+			break;
+		case Opt_remotename:
+			match_strcpy(v9ses->remotename, &args[0]);
+			break;
+		case Opt_legacy:
+			v9ses->extended = 0;
+			break;
+		case Opt_nodevmap:
+			v9ses->nodev = 1;
+			break;
+		default:
+			continue;
+		}
+	}
+}
+
+/**
+ * v9fs_inode2v9ses - safely extract v9fs session info from super block
+ * @inode: inode to extract information from
+ *
+ * Paranoid function to extract v9ses information from superblock,
+ * if anything is missing it will report an error.
+ *
+ */
+
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
+{
+	return (inode->i_sb->s_fs_info);
+}
+
+/**
+ * v9fs_get_idpool - allocate numeric id from pool
+ * @p - pool to allocate from
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+
+int v9fs_get_idpool(struct v9fs_idpool *p)
+{
+	int i = 0;
+	int error;
+
+retry:
+	if (idr_pre_get(&p->pool, GFP_KERNEL) == 0)
+		return 0;
+
+	if (down_interruptible(&p->lock) == -EINTR) {
+		eprintk(KERN_WARNING, "Interrupted while locking\n");
+		return -1;
+	}
+
+	error = idr_get_new(&p->pool, NULL, &i);
+	up(&p->lock);
+
+	if (error == -EAGAIN)
+		goto retry;
+	else if (error)
+		return -1;
+
+	return i;
+}
+
+/**
+ * v9fs_put_idpool - release numeric id from pool
+ * @p - pool to allocate from
+ *
+ * XXX - This seems to be an awful generic function, should it be in idr.c with
+ *            the lock included in struct idr?
+ */
+
+void v9fs_put_idpool(int id, struct v9fs_idpool *p)
+{
+	if (down_interruptible(&p->lock) == -EINTR) {
+		eprintk(KERN_WARNING, "Interrupted while locking\n");
+		return;
+	}
+	idr_remove(&p->pool, id);
+	up(&p->lock);
+}
+
+/**
+ * v9fs_session_init - initialize session
+ * @v9ses: session information structure
+ * @dev_name: device being mounted
+ * @data: options
+ *
+ */
+
+int
+v9fs_session_init(struct v9fs_session_info *v9ses,
+		  const char *dev_name, char *data)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_transport *trans_proto;
+	int n = 0;
+	int newfid = -1;
+	int retval = -EINVAL;
+
+	v9ses->name = __getname();
+	if (!v9ses->name)
+		return -ENOMEM;
+
+	v9ses->remotename = __getname();
+	if (!v9ses->remotename) {
+		putname(v9ses->name);
+		return -ENOMEM;
+	}
+
+	strcpy(v9ses->name, V9FS_DEFUSER);
+	strcpy(v9ses->remotename, V9FS_DEFANAME);
+
+	v9fs_parse_options(data, v9ses);
+
+	/* set global debug level */
+	v9fs_debug_level = v9ses->debug;
+
+	/* id pools that are session-dependent: FIDs and TIDs */
+	idr_init(&v9ses->fidpool.pool);
+	init_MUTEX(&v9ses->fidpool.lock);
+	idr_init(&v9ses->tidpool.pool);
+	init_MUTEX(&v9ses->tidpool.lock);
+
+
+	switch (v9ses->proto) {
+	case PROTO_TCP:
+		trans_proto = &v9fs_trans_tcp;
+		break;
+	case PROTO_UNIX:
+		trans_proto = &v9fs_trans_unix;
+		*v9ses->remotename = 0;
+		break;
+	case PROTO_FD:
+		trans_proto = &v9fs_trans_fd;
+		*v9ses->remotename = 0;
+		break;
+	default:
+		printk(KERN_ERR "v9fs: Bad mount protocol %d\n", v9ses->proto);
+		retval = -ENOPROTOOPT;
+		goto SessCleanUp;
+	};
+
+	v9ses->transport = trans_proto;
+
+	if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) {
+		eprintk(KERN_ERR, "problem initializing transport\n");
+		goto SessCleanUp;
+	}
+
+	v9ses->inprogress = 0;
+	v9ses->shutdown = 0;
+	v9ses->session_hung = 0;
+
+	if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) {
+		dprintk(DEBUG_ERROR, "problem initializing mux\n");
+		goto SessCleanUp;
+	}
+
+	if (v9ses->afid == ~0) {
+		if (v9ses->extended)
+			retval =
+			    v9fs_t_version(v9ses, v9ses->maxdata, "9P2000.u",
+					   &fcall);
+		else
+			retval = v9fs_t_version(v9ses, v9ses->maxdata, "9P2000",
+						&fcall);
+
+		if (retval < 0) {
+			dprintk(DEBUG_ERROR, "v9fs_t_version failed\n");
+			goto FreeFcall;
+		}
+
+		/* Really should check for 9P1 and report error */
+		if (!strcmp(fcall->params.rversion.version, "9P2000.u")) {
+			dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n");
+			v9ses->extended = 1;
+		} else {
+			dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n");
+			v9ses->extended = 0;
+		}
+
+		n = fcall->params.rversion.msize;
+		kfree(fcall);
+
+		if (n < v9ses->maxdata)
+			v9ses->maxdata = n;
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "couldn't allocate FID\n");
+		retval = -ENOMEM;
+		goto SessCleanUp;
+	}
+	/* it is a little bit ugly, but we have to prevent newfid */
+	/* being the same as afid, so if it is, get a new fid     */
+	if (v9ses->afid != ~0 && newfid == v9ses->afid) {
+		newfid = v9fs_get_idpool(&v9ses->fidpool);
+		if (newfid < 0) {
+			eprintk(KERN_WARNING, "couldn't allocate FID\n");
+			retval = -ENOMEM;
+			goto SessCleanUp;
+		}
+	}
+
+	if ((retval =
+	     v9fs_t_attach(v9ses, v9ses->name, v9ses->remotename, newfid,
+			   v9ses->afid, NULL))
+	    < 0) {
+		dprintk(DEBUG_ERROR, "cannot attach\n");
+		goto SessCleanUp;
+	}
+
+	if (v9ses->afid != ~0) {
+		if (v9fs_t_clunk(v9ses, v9ses->afid, NULL))
+			dprintk(DEBUG_ERROR, "clunk failed\n");
+	}
+
+	return newfid;
+
+      FreeFcall:
+	kfree(fcall);
+
+      SessCleanUp:
+	v9fs_session_close(v9ses);
+	return retval;
+}
+
+/**
+ * v9fs_session_close - shutdown a session
+ * @v9ses: session information structure
+ *
+ */
+
+void v9fs_session_close(struct v9fs_session_info *v9ses)
+{
+	if (v9ses->recvproc) {
+		send_sig(SIGKILL, v9ses->recvproc, 1);
+		wait_for_completion(&v9ses->proccmpl);
+	}
+
+	if (v9ses->transport)
+		v9ses->transport->close(v9ses->transport);
+
+	putname(v9ses->name);
+	putname(v9ses->remotename);
+}
+
+/**
+ * v9fs_session_cancel - mark transport as disconnected
+ * 	and cancel all pending requests.
+ */
+void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
+	v9ses->transport->status = Disconnected;
+	v9fs_mux_cancel_requests(v9ses, -EIO);
+}
+
+extern int v9fs_error_init(void);
+
+/**
+ * v9fs_init - Initialize module
+ *
+ */
+
+static int __init init_v9fs(void)
+{
+	v9fs_error_init();
+
+	printk(KERN_INFO "Installing v9fs 9P2000 file system support\n");
+
+	return register_filesystem(&v9fs_fs_type);
+}
+
+/**
+ * v9fs_init - shutdown module
+ *
+ */
+
+static void __exit exit_v9fs(void)
+{
+	unregister_filesystem(&v9fs_fs_type);
+}
+
+module_init(init_v9fs)
+module_exit(exit_v9fs)
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
new file mode 100644
index 00000000000..45dcef42bdd
--- /dev/null
+++ b/fs/9p/v9fs.h
@@ -0,0 +1,103 @@
+/*
+ * V9FS definitions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/*
+  * Idpool structure provides lock and id management
+  *
+  */
+
+struct v9fs_idpool {
+	struct semaphore lock;
+	struct idr pool;
+};
+
+/*
+  * Session structure provides information for an opened session
+  *
+  */
+
+struct v9fs_session_info {
+	/* options */
+	unsigned int maxdata;
+	unsigned char extended;	/* set to 1 if we are using UNIX extensions */
+	unsigned char nodev;	/* set to 1 if no disable device mapping */
+	unsigned short port;	/* port to connect to */
+	unsigned short debug;	/* debug level */
+	unsigned short proto;	/* protocol to use */
+	unsigned int afid;	/* authentication fid */
+	unsigned int rfdno;	/* read file descriptor number */
+	unsigned int wfdno;	/* write file descriptor number */
+
+
+	char *name;		/* user name to mount as */
+	char *remotename;	/* name of remote hierarchy being mounted */
+	unsigned int uid;	/* default uid/muid for legacy support */
+	unsigned int gid;	/* default gid for legacy support */
+
+	/* book keeping */
+	struct v9fs_idpool fidpool;	/* The FID pool for file descriptors */
+	struct v9fs_idpool tidpool;	/* The TID pool for transactions ids */
+
+	/* transport information */
+	struct v9fs_transport *transport;
+
+	int inprogress;		/* session in progress => true */
+	int shutdown;		/* session shutting down. no more attaches. */
+	unsigned char session_hung;
+
+	/* mux private data */
+	struct v9fs_fcall *curfcall;
+	wait_queue_head_t read_wait;
+	struct completion fcread;
+	struct completion proccmpl;
+	struct task_struct *recvproc;
+
+	spinlock_t muxlock;
+	struct list_head mux_fcalls;
+};
+
+/* possible values of ->proto */
+enum {
+	PROTO_TCP,
+	PROTO_UNIX,
+	PROTO_FD,
+};
+
+int v9fs_session_init(struct v9fs_session_info *, const char *, char *);
+struct v9fs_session_info *v9fs_inode2v9ses(struct inode *);
+void v9fs_session_close(struct v9fs_session_info *v9ses);
+int v9fs_get_idpool(struct v9fs_idpool *p);
+void v9fs_put_idpool(int id, struct v9fs_idpool *p);
+void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+
+#define V9FS_MAGIC 0x01021997
+
+/* other default globals */
+#define V9FS_PORT		564
+#define V9FS_DEFUSER	"nobody"
+#define V9FS_DEFANAME	""
+
+/* inital pool sizes for fids and tags */
+#define V9FS_START_FIDS 8192
+#define V9FS_START_TIDS 256
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
new file mode 100644
index 00000000000..2f2cea7ee3e
--- /dev/null
+++ b/fs/9p/v9fs_vfs.h
@@ -0,0 +1,53 @@
+/*
+ * V9FS VFS extensions.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+/* plan9 semantics are that created files are implicitly opened.
+ * But linux semantics are that you call create, then open.
+ * the plan9 approach is superior as it provides an atomic
+ * open.
+ * we track the create fid here. When the file is opened, if fidopen is
+ * non-zero, we use the fid and can skip some steps.
+ * there may be a better way to do this, but I don't know it.
+ * one BAD way is to clunk the fid on create, then open it again:
+ * you lose the atomicity of file open
+ */
+
+/* special case:
+ * unlink calls remove, which is an implicit clunk. So we have to track
+ * that kind of thing so that we don't try to clunk a dead fid.
+ */
+
+extern struct file_system_type v9fs_fs_type;
+extern struct file_operations v9fs_file_operations;
+extern struct file_operations v9fs_dir_operations;
+extern struct dentry_operations v9fs_dentry_operations;
+
+struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+ino_t v9fs_qid2ino(struct v9fs_qid *qid);
+void v9fs_mistat2inode(struct v9fs_stat *, struct inode *,
+		       struct super_block *);
+int v9fs_dir_release(struct inode *inode, struct file *filp);
+int v9fs_file_open(struct inode *inode, struct file *file);
+void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat);
+void v9fs_dentry_release(struct dentry *);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
new file mode 100644
index 00000000000..306c96741f8
--- /dev/null
+++ b/fs/9p/vfs_dentry.c
@@ -0,0 +1,126 @@
+/*
+ *  linux/fs/9p/vfs_dentry.c
+ *
+ * This file contians vfs dentry ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * v9fs_dentry_validate - VFS dcache hook to validate cache
+ * @dentry:  dentry that is being validated
+ * @nd: path data
+ *
+ * dcache really shouldn't be used for 9P2000 as at all due to
+ * potential attached semantics to directory traversal (walk).
+ *
+ * FUTURE: look into how to use dcache to allow multi-stage
+ * walks in Plan 9 & potential for better dcache operation which
+ * would remain valid for Plan 9 semantics.  Older versions
+ * had validation via stat for those interested.  However, since
+ * stat has the same approximate overhead as walk there really
+ * is no difference.  The only improvement would be from a
+ * time-decay cache like NFS has and that undermines the
+ * synchronous nature of 9P2000.
+ *
+ */
+
+static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *dc = current->fs->pwd;
+
+	dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
+	if (v9fs_fid_lookup(dentry, FID_OP)) {
+		dprintk(DEBUG_VFS, "VALID\n");
+		return 1;
+	}
+
+	while (dc != NULL) {
+		if (dc == dentry) {
+			dprintk(DEBUG_VFS, "VALID\n");
+			return 1;
+		}
+		if (dc == dc->d_parent)
+			break;
+
+		dc = dc->d_parent;
+	}
+
+	dprintk(DEBUG_VFS, "INVALID\n");
+	return 0;
+}
+
+/**
+ * v9fs_dentry_release - called when dentry is going to be freed
+ * @dentry:  dentry that is being release
+ *
+ */
+
+void v9fs_dentry_release(struct dentry *dentry)
+{
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+
+	if (dentry->d_fsdata != NULL) {
+		struct list_head *fid_list = dentry->d_fsdata;
+		struct v9fs_fid *temp = NULL;
+		struct v9fs_fid *current_fid = NULL;
+		struct v9fs_fcall *fcall = NULL;
+
+		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
+			if (v9fs_t_clunk
+			    (current_fid->v9ses, current_fid->fid, &fcall))
+				dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+					FCALL_ERROR(fcall));
+
+			v9fs_put_idpool(current_fid->fid,
+					&current_fid->v9ses->fidpool);
+
+			kfree(fcall);
+			v9fs_fid_destroy(current_fid);
+		}
+
+		kfree(dentry->d_fsdata);	/* free the list_head */
+	}
+}
+
+struct dentry_operations v9fs_dentry_operations = {
+	.d_revalidate = v9fs_dentry_validate,
+	.d_release = v9fs_dentry_release,
+};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
new file mode 100644
index 00000000000..c478a738418
--- /dev/null
+++ b/fs/9p/vfs_dir.c
@@ -0,0 +1,226 @@
+/*
+ * linux/fs/9p/vfs_dir.c
+ *
+ * This file contains vfs directory ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+/**
+ * dt_type - return file type
+ * @mistat: mistat structure
+ *
+ */
+
+static inline int dt_type(struct v9fs_stat *mistat)
+{
+	unsigned long perm = mistat->mode;
+	int rettype = DT_REG;
+
+	if (perm & V9FS_DMDIR)
+		rettype = DT_DIR;
+	if (perm & V9FS_DMSYMLINK)
+		rettype = DT_LNK;
+
+	return rettype;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filep: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *file = filp->private_data;
+	unsigned int i, n;
+	int fid = -1;
+	int ret = 0;
+	struct v9fs_stat *mi = NULL;
+	int over = 0;
+
+	dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name);
+
+	fid = file->fid;
+
+	mi = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	if (!mi)
+		return -ENOMEM;
+
+	if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) {
+		kfree(file->rdir_fcall);
+		file->rdir_fcall = NULL;
+	}
+
+	if (file->rdir_fcall) {
+		n = file->rdir_fcall->params.rread.count;
+		i = file->rdir_fpos;
+		while (i < n) {
+			int s = v9fs_deserialize_stat(v9ses,
+				  file->rdir_fcall->params.rread.data + i,
+			          n - i, mi, v9ses->maxdata);
+
+			if (s == 0) {
+				dprintk(DEBUG_ERROR,
+					"error while deserializing mistat\n");
+				ret = -EIO;
+				goto FreeStructs;
+			}
+
+			over = filldir(dirent, mi->name, strlen(mi->name),
+				    filp->f_pos, v9fs_qid2ino(&mi->qid),
+				    dt_type(mi));
+
+			if (over) {
+				file->rdir_fpos = i;
+				file->rdir_pos = filp->f_pos;
+				break;
+			}
+
+			i += s;
+			filp->f_pos += s;
+		}
+
+		if (!over) {
+			kfree(file->rdir_fcall);
+			file->rdir_fcall = NULL;
+		}
+	}
+
+	while (!over) {
+		ret = v9fs_t_read(v9ses, fid, filp->f_pos,
+					    v9ses->maxdata-V9FS_IOHDRSZ, &fcall);
+		if (ret < 0) {
+			dprintk(DEBUG_ERROR, "error while reading: %d: %p\n",
+				ret, fcall);
+			goto FreeStructs;
+		} else if (ret == 0)
+			break;
+
+		n = ret;
+		i = 0;
+		while (i < n) {
+			int s = v9fs_deserialize_stat(v9ses,
+			          fcall->params.rread.data + i, n - i, mi,
+			          v9ses->maxdata);
+
+			if (s == 0) {
+				dprintk(DEBUG_ERROR,
+					"error while deserializing mistat\n");
+				return -EIO;
+			}
+
+			over = filldir(dirent, mi->name, strlen(mi->name),
+				    filp->f_pos, v9fs_qid2ino(&mi->qid),
+				    dt_type(mi));
+
+			if (over) {
+				file->rdir_fcall = fcall;
+				file->rdir_fpos = i;
+				file->rdir_pos = filp->f_pos;
+				fcall = NULL;
+				break;
+			}
+
+			i += s;
+			filp->f_pos += s;
+		}
+
+		kfree(fcall);
+	}
+
+      FreeStructs:
+	kfree(fcall);
+	kfree(mi);
+	return ret;
+}
+
+/**
+ * v9fs_dir_release - close a directory
+ * @inode: inode of the directory
+ * @filp: file pointer to a directory
+ *
+ */
+
+int v9fs_dir_release(struct inode *inode, struct file *filp)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *fid = filp->private_data;
+	int fidnum = -1;
+
+	dprintk(DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp,
+		fid->fid);
+	fidnum = fid->fid;
+
+	filemap_fdatawrite(inode->i_mapping);
+	filemap_fdatawait(inode->i_mapping);
+
+	if (fidnum >= 0) {
+		fid->fidopen--;
+		dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen,
+			fid->fid);
+
+		if (fid->fidopen == 0) {
+			if (v9fs_t_clunk(v9ses, fidnum, NULL))
+				dprintk(DEBUG_ERROR, "clunk failed\n");
+
+			v9fs_put_idpool(fid->fid, &v9ses->fidpool);
+		}
+
+		kfree(fid->rdir_fcall);
+
+		filp->private_data = NULL;
+		v9fs_fid_destroy(fid);
+	}
+
+	d_drop(filp->f_dentry);
+	return 0;
+}
+
+struct file_operations v9fs_dir_operations = {
+	.read = generic_read_dir,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
new file mode 100644
index 00000000000..1f8ae7d580a
--- /dev/null
+++ b/fs/9p/vfs_file.c
@@ -0,0 +1,401 @@
+/*
+ *  linux/fs/9p/vfs_file.c
+ *
+ * This file contians vfs file ops for 9P2000.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "fid.h"
+
+/**
+ * v9fs_file_open - open a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ */
+
+int v9fs_file_open(struct inode *inode, struct file *file)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9fid = v9fs_fid_lookup(file->f_dentry, FID_WALK);
+	struct v9fs_fid *v9newfid = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	int open_mode = 0;
+	unsigned int iounit = 0;
+	int newfid = -1;
+	long result = -1;
+
+	dprintk(DEBUG_VFS, "inode: %p file: %p v9fid= %p\n", inode, file,
+		v9fid);
+
+	if (!v9fid) {
+		struct dentry *dentry = file->f_dentry;
+		dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
+
+		/* XXX - some duplication from lookup, generalize later */
+		/* basically vfs_lookup is too heavy weight */
+		v9fid = v9fs_fid_lookup(file->f_dentry, FID_OP);
+		if (!v9fid)
+			return -EBADF;
+
+		v9fid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+		if (!v9fid)
+			return -EBADF;
+
+		newfid = v9fs_get_idpool(&v9ses->fidpool);
+		if (newfid < 0) {
+			eprintk(KERN_WARNING, "newfid fails!\n");
+			return -ENOSPC;
+		}
+
+		result =
+		    v9fs_t_walk(v9ses, v9fid->fid, newfid,
+				(char *)file->f_dentry->d_name.name, NULL);
+		if (result < 0) {
+			v9fs_put_idpool(newfid, &v9ses->fidpool);
+			dprintk(DEBUG_ERROR, "rewalk didn't work\n");
+			return -EBADF;
+		}
+
+		v9fid = v9fs_fid_create(dentry);
+		if (v9fid == NULL) {
+			dprintk(DEBUG_ERROR, "couldn't insert\n");
+			return -ENOMEM;
+		}
+		v9fid->fid = newfid;
+	}
+
+	if (v9fid->fidcreate) {
+		/* create case */
+		newfid = v9fid->fid;
+		iounit = v9fid->iounit;
+		v9fid->fidcreate = 0;
+	} else {
+		if (!S_ISDIR(inode->i_mode))
+			newfid = v9fid->fid;
+		else {
+			newfid = v9fs_get_idpool(&v9ses->fidpool);
+			if (newfid < 0) {
+				eprintk(KERN_WARNING, "allocation failed\n");
+				return -ENOSPC;
+			}
+			/* This would be a somewhat critical clone */
+			result =
+			    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL,
+					&fcall);
+			if (result < 0) {
+				dprintk(DEBUG_ERROR, "clone error: %s\n",
+					FCALL_ERROR(fcall));
+				kfree(fcall);
+				return result;
+			}
+
+			v9newfid = v9fs_fid_create(file->f_dentry);
+			v9newfid->fid = newfid;
+			v9newfid->qid = v9fid->qid;
+			v9newfid->iounit = v9fid->iounit;
+			v9newfid->fidopen = 0;
+			v9newfid->fidclunked = 0;
+			v9newfid->v9ses = v9ses;
+			v9fid = v9newfid;
+			kfree(fcall);
+		}
+
+		/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
+		/* translate open mode appropriately */
+		open_mode = file->f_flags & 0x3;
+
+		if (file->f_flags & O_EXCL)
+			open_mode |= V9FS_OEXCL;
+
+		if (v9ses->extended) {
+			if (file->f_flags & O_TRUNC)
+				open_mode |= V9FS_OTRUNC;
+
+			if (file->f_flags & O_APPEND)
+				open_mode |= V9FS_OAPPEND;
+		}
+
+		result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
+		if (result < 0) {
+			dprintk(DEBUG_ERROR,
+				"open failed, open_mode 0x%x: %s\n", open_mode,
+				FCALL_ERROR(fcall));
+			kfree(fcall);
+			return result;
+		}
+
+		iounit = fcall->params.ropen.iounit;
+		kfree(fcall);
+	}
+
+
+	file->private_data = v9fid;
+
+	v9fid->rdir_pos = 0;
+	v9fid->rdir_fcall = NULL;
+	v9fid->fidopen = 1;
+	v9fid->filp = file;
+	v9fid->iounit = iounit;
+
+	return 0;
+}
+
+/**
+ * v9fs_file_lock - lock a file (or directory)
+ * @inode: inode to be opened
+ * @file: file being opened
+ *
+ * XXX - this looks like a local only lock, we should extend into 9P
+ *       by using open exclusive
+ */
+
+static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	int res = 0;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	dprintk(DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+
+	/* No mandatory locks */
+	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+		return -ENOLCK;
+
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+		filemap_fdatawrite(inode->i_mapping);
+		filemap_fdatawait(inode->i_mapping);
+		invalidate_inode_pages(&inode->i_data);
+	}
+
+	return res;
+}
+
+/**
+ * v9fs_read - read from a file (internal)
+ * @filep: file pointer to read
+ * @data: data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
+static ssize_t
+v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9f = filp->private_data;
+	struct v9fs_fcall *fcall = NULL;
+	int fid = v9f->fid;
+	int rsize = 0;
+	int result = 0;
+	int total = 0;
+
+	dprintk(DEBUG_VFS, "\n");
+
+	rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+	if (v9f->iounit != 0 && rsize > v9f->iounit)
+		rsize = v9f->iounit;
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		result = v9fs_t_read(v9ses, fid, *offset, rsize, &fcall);
+
+		if (result < 0) {
+			printk(KERN_ERR "9P2000: v9fs_t_read returned %d\n",
+			       result);
+
+			kfree(fcall);
+			return total;
+		} else
+			*offset += result;
+
+		/* XXX - extra copy */
+		memcpy(buffer, fcall->params.rread.data, result);
+		count -= result;
+		buffer += result;
+		total += result;
+
+		kfree(fcall);
+
+		if (result < rsize)
+			break;
+	} while (count);
+
+	return total;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filep: file pointer to read
+ * @data: data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
+static ssize_t
+v9fs_file_read(struct file *filp, char __user * data, size_t count,
+	       loff_t * offset)
+{
+	int retval = -1;
+	int ret = 0;
+	char *buffer;
+
+	buffer = kmalloc(count, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	retval = v9fs_read(filp, buffer, count, offset);
+	if (retval > 0) {
+		if ((ret = copy_to_user(data, buffer, retval)) != 0) {
+			dprintk(DEBUG_ERROR, "Problem copying to user %d\n",
+				ret);
+			retval = ret;
+		}
+	}
+
+	kfree(buffer);
+
+	return retval;
+}
+
+/**
+ * v9fs_write - write to a file
+ * @filep: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+
+static ssize_t
+v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct v9fs_fid *v9fid = filp->private_data;
+	struct v9fs_fcall *fcall;
+	int fid = v9fid->fid;
+	int result = -EIO;
+	int rsize = 0;
+	int total = 0;
+
+	dprintk(DEBUG_VFS, "data %p count %d offset %x\n", buffer, (int)count,
+		(int)*offset);
+	rsize = v9ses->maxdata - V9FS_IOHDRSZ;
+	if (v9fid->iounit != 0 && rsize > v9fid->iounit)
+		rsize = v9fid->iounit;
+
+	dump_data(buffer, count);
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		result =
+		    v9fs_t_write(v9ses, fid, *offset, rsize, buffer, &fcall);
+		if (result < 0) {
+			eprintk(KERN_ERR, "error while writing: %s(%d)\n",
+				FCALL_ERROR(fcall), result);
+			kfree(fcall);
+			return result;
+		} else
+			*offset += result;
+
+		kfree(fcall);
+
+		if (result != rsize) {
+			eprintk(KERN_ERR,
+				"short write: v9fs_t_write returned %d\n",
+				result);
+			break;
+		}
+
+		count -= result;
+		buffer += result;
+		total += result;
+	} while (count);
+
+	return total;
+}
+
+/**
+ * v9fs_file_write - write to a file
+ * @filep: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+
+static ssize_t
+v9fs_file_write(struct file *filp, const char __user * data,
+		size_t count, loff_t * offset)
+{
+	int ret = -1;
+	char *buffer;
+
+	buffer = kmalloc(count, GFP_KERNEL);
+	if (buffer == NULL)
+		return -ENOMEM;
+
+	ret = copy_from_user(buffer, data, count);
+	if (ret) {
+		dprintk(DEBUG_ERROR, "Problem copying from user\n");
+		ret = -EFAULT;
+	} else {
+		ret = v9fs_write(filp, buffer, count, offset);
+	}
+
+	kfree(buffer);
+
+	return ret;
+}
+
+struct file_operations v9fs_file_operations = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
new file mode 100644
index 00000000000..0c13fc60004
--- /dev/null
+++ b/fs/9p/vfs_inode.c
@@ -0,0 +1,1338 @@
+/*
+ *  linux/fs/9p/vfs_inode.c
+ *
+ * This file contains vfs inode ops for the 9P2000 protocol.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+static struct inode_operations v9fs_dir_inode_operations;
+static struct inode_operations v9fs_dir_inode_operations_ext;
+static struct inode_operations v9fs_file_inode_operations;
+static struct inode_operations v9fs_symlink_inode_operations;
+
+/**
+ * unixmode2p9mode - convert unix mode bits to plan 9
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ */
+
+static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
+{
+	int res;
+	res = mode & 0777;
+	if (S_ISDIR(mode))
+		res |= V9FS_DMDIR;
+	if (v9ses->extended) {
+		if (S_ISLNK(mode))
+			res |= V9FS_DMSYMLINK;
+		if (v9ses->nodev == 0) {
+			if (S_ISSOCK(mode))
+				res |= V9FS_DMSOCKET;
+			if (S_ISFIFO(mode))
+				res |= V9FS_DMNAMEDPIPE;
+			if (S_ISBLK(mode))
+				res |= V9FS_DMDEVICE;
+			if (S_ISCHR(mode))
+				res |= V9FS_DMDEVICE;
+		}
+
+		if ((mode & S_ISUID) == S_ISUID)
+			res |= V9FS_DMSETUID;
+		if ((mode & S_ISGID) == S_ISGID)
+			res |= V9FS_DMSETGID;
+		if ((mode & V9FS_DMLINK))
+			res |= V9FS_DMLINK;
+	}
+
+	return res;
+}
+
+/**
+ * p9mode2unixmode- convert plan9 mode bits to unix mode bits
+ * @v9ses: v9fs session information
+ * @mode: mode to convert
+ *
+ */
+
+static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+{
+	int res;
+
+	res = mode & 0777;
+
+	if ((mode & V9FS_DMDIR) == V9FS_DMDIR)
+		res |= S_IFDIR;
+	else if ((mode & V9FS_DMSYMLINK) && (v9ses->extended))
+		res |= S_IFLNK;
+	else if ((mode & V9FS_DMSOCKET) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFSOCK;
+	else if ((mode & V9FS_DMNAMEDPIPE) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFIFO;
+	else if ((mode & V9FS_DMDEVICE) && (v9ses->extended)
+		 && (v9ses->nodev == 0))
+		res |= S_IFBLK;
+	else
+		res |= S_IFREG;
+
+	if (v9ses->extended) {
+		if ((mode & V9FS_DMSETUID) == V9FS_DMSETUID)
+			res |= S_ISUID;
+
+		if ((mode & V9FS_DMSETGID) == V9FS_DMSETGID)
+			res |= S_ISGID;
+	}
+
+	return res;
+}
+
+/**
+ * v9fs_blank_mistat - helper function to setup a 9P stat structure
+ * @v9ses: 9P session info (for determining extended mode)
+ * @mistat: structure to initialize
+ *
+ */
+
+static void
+v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat)
+{
+	mistat->type = ~0;
+	mistat->dev = ~0;
+	mistat->qid.type = ~0;
+	mistat->qid.version = ~0;
+	*((long long *)&mistat->qid.path) = ~0;
+	mistat->mode = ~0;
+	mistat->atime = ~0;
+	mistat->mtime = ~0;
+	mistat->length = ~0;
+	mistat->name = mistat->data;
+	mistat->uid = mistat->data;
+	mistat->gid = mistat->data;
+	mistat->muid = mistat->data;
+	if (v9ses->extended) {
+		mistat->n_uid = ~0;
+		mistat->n_gid = ~0;
+		mistat->n_muid = ~0;
+		mistat->extension = mistat->data;
+	}
+	*mistat->data = 0;
+}
+
+/**
+ * v9fs_mistat2unix - convert mistat to unix stat
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @buf: unix metadata (stat) structure to populate
+ * @sb: superblock
+ *
+ */
+
+static void
+v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf,
+		 struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL;
+
+	buf->st_nlink = 1;
+
+	buf->st_atime = mistat->atime;
+	buf->st_mtime = mistat->mtime;
+	buf->st_ctime = mistat->mtime;
+
+	buf->st_uid = (unsigned short)-1;
+	buf->st_gid = (unsigned short)-1;
+
+	if (v9ses && v9ses->extended) {
+		/* TODO: string to uid mapping via user-space daemon */
+		if (mistat->n_uid != -1)
+			sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid);
+
+		if (mistat->n_gid != -1)
+			sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid);
+	}
+
+	if (buf->st_uid == (unsigned short)-1)
+		buf->st_uid = v9ses->uid;
+	if (buf->st_gid == (unsigned short)-1)
+		buf->st_gid = v9ses->gid;
+
+	buf->st_mode = p9mode2unixmode(v9ses, mistat->mode);
+	if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) {
+		char type = 0;
+		int major = -1;
+		int minor = -1;
+		sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			buf->st_mode &= ~S_IFBLK;
+			buf->st_mode |= S_IFCHR;
+			break;
+		case 'b':
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+				type, mistat->extension);
+		};
+		buf->st_rdev = MKDEV(major, minor);
+	} else
+		buf->st_rdev = 0;
+
+	buf->st_size = mistat->length;
+
+	buf->st_blksize = sb->s_blocksize;
+	buf->st_blocks =
+	    (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits;
+}
+
+/**
+ * v9fs_get_inode - helper function to setup an inode
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ */
+
+struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = NULL;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+
+	inode = new_inode(sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = sb->s_blocksize;
+		inode->i_blocks = 0;
+		inode->i_rdev = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+		switch (mode & S_IFMT) {
+		case S_IFIFO:
+		case S_IFBLK:
+		case S_IFCHR:
+		case S_IFSOCK:
+			if(!v9ses->extended) {
+				dprintk(DEBUG_ERROR, "special files without extended mode\n");
+				return ERR_PTR(-EINVAL);
+			}
+			init_special_inode(inode, inode->i_mode,
+					   inode->i_rdev);
+			break;
+		case S_IFREG:
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+			break;
+		case S_IFLNK:
+			if(!v9ses->extended) {
+				dprintk(DEBUG_ERROR, "extended modes used w/o 9P2000.u\n");
+				return ERR_PTR(-EINVAL);
+			}
+			inode->i_op = &v9fs_symlink_inode_operations;
+			break;
+		case S_IFDIR:
+			inode->i_nlink++;
+			if(v9ses->extended)
+				inode->i_op = &v9fs_dir_inode_operations_ext;
+			else
+				inode->i_op = &v9fs_dir_inode_operations;
+			inode->i_fop = &v9fs_dir_operations;
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+				mode, mode & S_IFMT);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		eprintk(KERN_WARNING, "Problem allocating inode\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	return inode;
+}
+
+/**
+ * v9fs_create - helper function to create files and directories
+ * @dir: directory inode file is being created in
+ * @file_dentry: dentry file is being created in
+ * @perm: permissions file is being created with
+ * @open_mode: resulting open mode for file
+ *
+ */
+
+static int
+v9fs_create(struct inode *dir,
+	    struct dentry *file_dentry,
+	    unsigned int perm, unsigned int open_mode)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct super_block *sb = dir->i_sb;
+	struct v9fs_fid *dirfid =
+	    v9fs_fid_lookup(file_dentry->d_parent, FID_WALK);
+	struct v9fs_fid *fid = NULL;
+	struct inode *file_inode = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_qid qid;
+	struct stat newstat;
+	int dirfidnum = -1;
+	long newfid = -1;
+	int result = 0;
+	unsigned int iounit = 0;
+
+	perm = unixmode2p9mode(v9ses, perm);
+
+	dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
+		file_dentry, perm, open_mode);
+
+	if (!dirfid)
+		return -EBADF;
+
+	dirfidnum = dirfid->fid;
+	if (dirfidnum < 0) {
+		dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
+			dir->i_ino);
+		return -EBADF;
+	}
+
+	if (file_dentry->d_inode) {
+		dprintk(DEBUG_ERROR,
+			"Odd. There is an inode for dir %lu, name :%s:\n",
+			dir->i_ino, file_dentry->d_name.name);
+		return -EEXIST;
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "no free fids available\n");
+		return -ENOSPC;
+	}
+
+	result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall));
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		newfid = 0;
+		goto CleanUpFid;
+	}
+
+	kfree(fcall);
+
+	result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
+			       perm, open_mode, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "create fails: %s(%d)\n",
+			FCALL_ERROR(fcall), result);
+
+		goto CleanUpFid;
+	}
+
+	iounit = fcall->params.rcreate.iounit;
+	qid = fcall->params.rcreate.qid;
+	kfree(fcall);
+
+	fid = v9fs_fid_create(file_dentry);
+	if (!fid) {
+		result = -ENOMEM;
+		goto CleanUpFid;
+	}
+
+	fid->fid = newfid;
+	fid->fidopen = 0;
+	fid->fidcreate = 1;
+	fid->qid = qid;
+	fid->iounit = iounit;
+	fid->rdir_pos = 0;
+	fid->rdir_fcall = NULL;
+	fid->v9ses = v9ses;
+
+	if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
+	    (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
+	    (perm & V9FS_DMDEVICE))
+		return 0;
+
+	result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall),
+			result);
+		goto CleanUpFid;
+	}
+
+	v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+
+	file_inode = v9fs_get_inode(sb, newstat.st_mode);
+	if ((!file_inode) || IS_ERR(file_inode)) {
+		dprintk(DEBUG_ERROR, "create inode failed\n");
+		result = -EBADF;
+		goto CleanUpFid;
+	}
+
+	v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb);
+	kfree(fcall);
+	d_instantiate(file_dentry, file_inode);
+
+	if (perm & V9FS_DMDIR) {
+		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n",
+				FCALL_ERROR(fcall));
+
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		kfree(fcall);
+		fid->fidopen = 0;
+		fid->fidcreate = 0;
+		d_drop(file_dentry);
+	}
+
+	return 0;
+
+      CleanUpFid:
+	kfree(fcall);
+
+	if (newfid) {
+		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+			dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+				FCALL_ERROR(fcall));
+
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		kfree(fcall);
+	}
+	return result;
+}
+
+/**
+ * v9fs_remove - helper function to remove files and directories
+ * @dir: directory inode that is being deleted
+ * @file:  dentry that is being deleted
+ * @rmdir: removing a directory
+ *
+ */
+
+static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct super_block *sb = NULL;
+	struct v9fs_session_info *v9ses = NULL;
+	struct v9fs_fid *v9fid = NULL;
+	struct inode *file_inode = NULL;
+	int fid = -1;
+	int result = 0;
+
+	dprintk(DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
+		rmdir);
+
+	file_inode = file->d_inode;
+	sb = file_inode->i_sb;
+	v9ses = v9fs_inode2v9ses(file_inode);
+	v9fid = v9fs_fid_lookup(file, FID_OP);
+
+	if (!v9fid) {
+		dprintk(DEBUG_ERROR,
+			"no v9fs_fid\n");
+		return -EBADF;
+	}
+
+	fid = v9fid->fid;
+	if (fid < 0) {
+		dprintk(DEBUG_ERROR, "inode #%lu, no fid!\n",
+			file_inode->i_ino);
+		return -EBADF;
+	}
+
+	result = v9fs_t_remove(v9ses, fid, &fcall);
+	if (result < 0)
+		dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n",
+			FCALL_ERROR(fcall), result);
+	else {
+		v9fs_put_idpool(fid, &v9ses->fidpool);
+		v9fs_fid_destroy(v9fid);
+	}
+
+	kfree(fcall);
+	return result;
+}
+
+/**
+ * v9fs_vfs_create - VFS hook to create files
+ * @inode: directory inode that is being deleted
+ * @dentry:  dentry that is being deleted
+ * @perm: create permissions
+ * @nd: path information
+ *
+ */
+
+static int
+v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
+		struct nameidata *nd)
+{
+	return v9fs_create(inode, dentry, perm, O_RDWR);
+}
+
+/**
+ * v9fs_vfs_mkdir - VFS mkdir hook to create a directory
+ * @inode:  inode that is being unlinked
+ * @dentry: dentry that is being unlinked
+ * @mode: mode for new directory
+ *
+ */
+
+static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode)
+{
+	return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY);
+}
+
+/**
+ * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
+ * @dir:  inode that is being walked from
+ * @dentry: dentry that is being walked to?
+ * @nameidata: path data
+ *
+ */
+
+static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nameidata)
+{
+	struct super_block *sb;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dirfid;
+	struct v9fs_fid *fid;
+	struct inode *inode;
+	struct v9fs_fcall *fcall = NULL;
+	struct stat newstat;
+	int dirfidnum = -1;
+	int newfid = -1;
+	int result = 0;
+
+	dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+		dir, dentry->d_iname, dentry, nameidata);
+
+	sb = dir->i_sb;
+	v9ses = v9fs_inode2v9ses(dir);
+	dirfid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+
+	if (!dirfid) {
+		dprintk(DEBUG_ERROR, "no dirfid\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	dirfidnum = dirfid->fid;
+
+	if (dirfidnum < 0) {
+		dprintk(DEBUG_ERROR, "no dirfid for inode %p, #%lu\n",
+			dir, dir->i_ino);
+		return ERR_PTR(-EBADF);
+	}
+
+	newfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "newfid fails!\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	result =
+	    v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name,
+			NULL);
+	if (result < 0) {
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		if (result == -ENOENT) {
+			d_add(dentry, NULL);
+			dprintk(DEBUG_ERROR,
+				"Return negative dentry %p count %d\n",
+				dentry, atomic_read(&dentry->d_count));
+			return NULL;
+		}
+		dprintk(DEBUG_ERROR, "walk error:%d\n", result);
+		goto FreeFcall;
+	}
+
+	result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		goto FreeFcall;
+	}
+
+	v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb);
+	inode = v9fs_get_inode(sb, newstat.st_mode);
+
+	if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) {
+		eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n",
+			PTR_ERR(inode));
+
+		result = -ENOSPC;
+		goto FreeFcall;
+	}
+
+	inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+
+	fid = v9fs_fid_create(dentry);
+	if (fid == NULL) {
+		dprintk(DEBUG_ERROR, "couldn't insert\n");
+		result = -ENOMEM;
+		goto FreeFcall;
+	}
+
+	fid->fid = newfid;
+	fid->fidopen = 0;
+	fid->v9ses = v9ses;
+	fid->qid = fcall->params.rstat.stat->qid;
+
+	dentry->d_op = &v9fs_dentry_operations;
+	v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb);
+
+	d_add(dentry, inode);
+	kfree(fcall);
+
+	return NULL;
+
+      FreeFcall:
+	kfree(fcall);
+	return ERR_PTR(result);
+}
+
+/**
+ * v9fs_vfs_unlink - VFS unlink hook to delete an inode
+ * @i:  inode that is being unlinked
+ * @d: dentry that is being unlinked
+ *
+ */
+
+static int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, 0);
+}
+
+/**
+ * v9fs_vfs_rmdir - VFS unlink hook to delete a directory
+ * @i:  inode that is being unlinked
+ * @d: dentry that is being unlinked
+ *
+ */
+
+static int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
+{
+	return v9fs_remove(i, d, 1);
+}
+
+/**
+ * v9fs_vfs_rename - VFS hook to rename an inode
+ * @old_dir:  old dir inode
+ * @old_dentry: old dentry
+ * @new_dir: new dir inode
+ * @new_dentry: new dentry
+ *
+ */
+
+static int
+v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_WALK);
+	struct v9fs_fid *olddirfid =
+	    v9fs_fid_lookup(old_dentry->d_parent, FID_WALK);
+	struct v9fs_fid *newdirfid =
+	    v9fs_fid_lookup(new_dentry->d_parent, FID_WALK);
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	struct v9fs_fcall *fcall = NULL;
+	int fid = -1;
+	int olddirfidnum = -1;
+	int newdirfidnum = -1;
+	int retval = 0;
+
+	dprintk(DEBUG_VFS, "\n");
+
+	if (!mistat)
+		return -ENOMEM;
+
+	if ((!oldfid) || (!olddirfid) || (!newdirfid)) {
+		dprintk(DEBUG_ERROR, "problem with arguments\n");
+		return -EBADF;
+	}
+
+	/* 9P can only handle file rename in the same directory */
+	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
+		dprintk(DEBUG_ERROR, "old dir and new dir are different\n");
+		retval = -EPERM;
+		goto FreeFcallnBail;
+	}
+
+	fid = oldfid->fid;
+	olddirfidnum = olddirfid->fid;
+	newdirfidnum = newdirfid->fid;
+
+	if (fid < 0) {
+		dprintk(DEBUG_ERROR, "no fid for old file #%lu\n",
+			old_inode->i_ino);
+		retval = -EBADF;
+		goto FreeFcallnBail;
+	}
+
+	v9fs_blank_mistat(v9ses, mistat);
+
+	strcpy(mistat->data + 1, v9ses->name);
+	mistat->name = mistat->data + 1 + strlen(v9ses->name);
+
+	if (new_dentry->d_name.len >
+	    (v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) {
+		dprintk(DEBUG_ERROR, "new name too long\n");
+		goto FreeFcallnBail;
+	}
+
+	strcpy(mistat->name, new_dentry->d_name.name);
+	retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall);
+
+      FreeFcallnBail:
+	kfree(mistat);
+
+	if (retval < 0)
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+
+	kfree(fcall);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_getattr - retreive file metadata
+ * @mnt - mount information
+ * @dentry - file to get attributes on
+ * @stat - metadata structure to populate
+ *
+ */
+
+static int
+v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	int err = -EPERM;
+
+	dprintk(DEBUG_VFS, "dentry: %p\n", dentry);
+	if (!fid) {
+		dprintk(DEBUG_ERROR,
+			"couldn't find fid associated with dentry\n");
+		return -EBADF;
+	}
+
+	err = v9fs_t_stat(v9ses, fid->fid, &fcall);
+
+	if (err < 0)
+		dprintk(DEBUG_ERROR, "stat error\n");
+	else {
+		v9fs_mistat2inode(fcall->params.rstat.stat, dentry->d_inode,
+				  dentry->d_inode->i_sb);
+		generic_fillattr(dentry->d_inode, stat);
+	}
+
+	kfree(fcall);
+	return err;
+}
+
+/**
+ * v9fs_vfs_setattr - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ */
+
+static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	int res = -EPERM;
+
+	dprintk(DEBUG_VFS, "\n");
+
+	if (!mistat)
+		return -ENOMEM;
+
+	if (!fid) {
+		dprintk(DEBUG_ERROR,
+			"Couldn't find fid associated with dentry\n");
+		return -EBADF;
+	}
+
+	v9fs_blank_mistat(v9ses, mistat);
+	if (iattr->ia_valid & ATTR_MODE)
+		mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode);
+
+	if (iattr->ia_valid & ATTR_MTIME)
+		mistat->mtime = iattr->ia_mtime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_ATIME)
+		mistat->atime = iattr->ia_atime.tv_sec;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		mistat->length = iattr->ia_size;
+
+	if (v9ses->extended) {
+		char *ptr = mistat->data+1;
+
+		if (iattr->ia_valid & ATTR_UID) {
+			mistat->uid = ptr;
+			ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid);
+			mistat->n_uid = iattr->ia_uid;
+		}
+
+		if (iattr->ia_valid & ATTR_GID) {
+			mistat->gid = ptr;
+			ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid);
+			mistat->n_gid = iattr->ia_gid;
+		}
+	}
+
+	res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall);
+
+	if (res < 0)
+		dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall));
+
+	kfree(mistat);
+	kfree(fcall);
+
+	if (res >= 0)
+		res = inode_setattr(dentry->d_inode, iattr);
+
+	return res;
+}
+
+/**
+ * v9fs_mistat2inode - populate an inode structure with mistat info
+ * @mistat: Plan 9 metadata (mistat) structure
+ * @inode: inode to populate
+ * @sb: superblock of filesystem
+ *
+ */
+
+void
+v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode,
+		  struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	inode->i_nlink = 1;
+
+	inode->i_atime.tv_sec = mistat->atime;
+	inode->i_mtime.tv_sec = mistat->mtime;
+	inode->i_ctime.tv_sec = mistat->mtime;
+
+	inode->i_uid = -1;
+	inode->i_gid = -1;
+
+	if (v9ses->extended) {
+		/* TODO: string to uid mapping via user-space daemon */
+		inode->i_uid = mistat->n_uid;
+		inode->i_gid = mistat->n_gid;
+
+		if (mistat->n_uid == -1)
+			sscanf(mistat->uid, "%x", &inode->i_uid);
+
+		if (mistat->n_gid == -1)
+			sscanf(mistat->gid, "%x", &inode->i_gid);
+	}
+
+	if (inode->i_uid == -1)
+		inode->i_uid = v9ses->uid;
+	if (inode->i_gid == -1)
+		inode->i_gid = v9ses->gid;
+
+	inode->i_mode = p9mode2unixmode(v9ses, mistat->mode);
+	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
+		char type = 0;
+		int major = -1;
+		int minor = -1;
+		sscanf(mistat->extension, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			inode->i_mode &= ~S_IFBLK;
+			inode->i_mode |= S_IFCHR;
+			break;
+		case 'b':
+			break;
+		default:
+			dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n",
+				type, mistat->extension);
+		};
+		inode->i_rdev = MKDEV(major, minor);
+	} else
+		inode->i_rdev = 0;
+
+	inode->i_size = mistat->length;
+
+	inode->i_blksize = sb->s_blocksize;
+	inode->i_blocks =
+	    (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits;
+}
+
+/**
+ * v9fs_qid2ino - convert qid into inode number
+ * @qid: qid to hash
+ *
+ * BUG: potential for inode number collisions?
+ */
+
+ino_t v9fs_qid2ino(struct v9fs_qid *qid)
+{
+	u64 path = qid->path + 2;
+	ino_t i = 0;
+
+	if (sizeof(ino_t) == sizeof(path))
+		memcpy(&i, &path, sizeof(ino_t));
+	else
+		i = (ino_t) (path ^ (path >> 32));
+
+	return i;
+}
+
+/**
+ * v9fs_vfs_symlink - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * See 9P2000.u RFC for more information
+ *
+ */
+
+static int
+v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int retval = -EPERM;
+	struct v9fs_fid *newfid;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+
+	dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+		symname);
+
+	if (!mistat)
+		return -ENOMEM;
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeFcall;
+	}
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, S_IFLNK, 0);
+	if (retval != 0)
+		goto FreeFcall;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+
+	/* issue a twstat */
+	v9fs_blank_mistat(v9ses, mistat);
+	strcpy(mistat->data + 1, symname);
+	mistat->extension = mistat->data + 1;
+	retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeFcall;
+	}
+
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeFcall;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+      FreeFcall:
+	kfree(mistat);
+	kfree(fcall);
+
+	return retval;
+}
+
+/**
+ * v9fs_readlink - read a symlink's location (internal version)
+ * @dentry: dentry for symlink
+ * @buffer: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ */
+
+static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	int retval = -EPERM;
+
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+
+	if (!fid) {
+		dprintk(DEBUG_ERROR, "could not resolve fid from dentry\n");
+		retval = -EBADF;
+		goto FreeFcall;
+	}
+
+	if (!v9ses->extended) {
+		retval = -EBADF;
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeFcall;
+	}
+
+	dprintk(DEBUG_VFS, " %s\n", dentry->d_name.name);
+	retval = v9fs_t_stat(v9ses, fid->fid, &fcall);
+
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		goto FreeFcall;
+	}
+
+	if (!fcall)
+		return -EIO;
+
+	if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) {
+		retval = -EINVAL;
+		goto FreeFcall;
+	}
+
+	/* copy extension buffer into buffer */
+	if (strlen(fcall->params.rstat.stat->extension) < buflen)
+		buflen = strlen(fcall->params.rstat.stat->extension);
+
+	memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1);
+
+	retval = buflen;
+
+      FreeFcall:
+	kfree(fcall);
+
+	return retval;
+}
+
+/**
+ * v9fs_vfs_readlink - read a symlink's location
+ * @dentry: dentry for symlink
+ * @buf: buffer to load symlink location into
+ * @buflen: length of buffer
+ *
+ */
+
+static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
+			     int buflen)
+{
+	int retval;
+	int ret;
+	char *link = __getname();
+
+	if (strlen(link) < buflen)
+		buflen = strlen(link);
+
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+
+	retval = v9fs_readlink(dentry, link, buflen);
+
+	if (retval > 0) {
+		if ((ret = copy_to_user(buffer, link, retval)) != 0) {
+			dprintk(DEBUG_ERROR, "problem copying to user: %d\n",
+				ret);
+			retval = ret;
+		}
+	}
+
+	putname(link);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_follow_link - follow a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int len = 0;
+	char *link = __getname();
+
+	dprintk(DEBUG_VFS, "%s n", dentry->d_name.name);
+
+	if (!link)
+		link = ERR_PTR(-ENOMEM);
+	else {
+		len = v9fs_readlink(dentry, link, strlen(link));
+
+		if (len < 0) {
+			putname(link);
+			link = ERR_PTR(len);
+		} else
+			link[len] = 0;
+	}
+	nd_set_link(nd, link);
+
+	return NULL;
+}
+
+/**
+ * v9fs_vfs_put_link - release a symlink path
+ * @dentry: dentry for symlink
+ * @nd: nameidata
+ *
+ */
+
+static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *s = nd_get_link(nd);
+
+	dprintk(DEBUG_VFS, " %s %s\n", dentry->d_name.name, s);
+	if (!IS_ERR(s))
+		putname(s);
+}
+
+/**
+ * v9fs_vfs_link - create a hardlink
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ */
+
+/* XXX - lots of code dup'd from symlink and creates,
+ * figure out a better reuse strategy
+ */
+
+static int
+v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
+	      struct dentry *dentry)
+{
+	int retval = -EPERM;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_OP);
+	struct v9fs_fid *newfid = NULL;
+	char *symname = __getname();
+
+	dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
+		old_dentry->d_name.name);
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeMem;
+	}
+
+	/* get fid of old_dentry */
+	sprintf(symname, "hardlink(%d)\n", oldfid->fid);
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0);
+	if (retval != 0)
+		goto FreeMem;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	if (!newfid) {
+		dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
+		goto FreeMem;
+	}
+
+	/* issue a twstat */
+	v9fs_blank_mistat(v9ses, mistat);
+	strcpy(mistat->data + 1, symname);
+	mistat->extension = mistat->data + 1;
+	retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+	if (retval < 0) {
+		dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+	kfree(fcall);
+	fcall = NULL;
+
+      FreeMem:
+	kfree(mistat);
+	kfree(fcall);
+	putname(symname);
+	return retval;
+}
+
+/**
+ * v9fs_vfs_mknod - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @dev_t: device associated with special file
+ *
+ */
+
+static int
+v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+{
+	int retval = -EPERM;
+	struct v9fs_fid *newfid;
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
+	struct v9fs_fcall *fcall = NULL;
+	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
+	char *symname = __getname();
+
+	dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+
+	if (!mistat)
+		return -ENOMEM;
+
+	if (!new_valid_dev(rdev)) {
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	if (!v9ses->extended) {
+		dprintk(DEBUG_ERROR, "not extended\n");
+		goto FreeMem;
+	}
+
+	/* issue a create */
+	retval = v9fs_create(dir, dentry, mode, 0);
+
+	if (retval != 0)
+		goto FreeMem;
+
+	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	if (!newfid) {
+		dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n");
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	/* build extension */
+	if (S_ISBLK(mode))
+		sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISCHR(mode))
+		sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev));
+	else if (S_ISFIFO(mode))
+		;	/* DO NOTHING */
+	else {
+		retval = -EINVAL;
+		goto FreeMem;
+	}
+
+	if (!S_ISFIFO(mode)) {
+		/* issue a twstat */
+		v9fs_blank_mistat(v9ses, mistat);
+		strcpy(mistat->data + 1, symname);
+		mistat->extension = mistat->data + 1;
+		retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall);
+		if (retval < 0) {
+			dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n",
+				FCALL_ERROR(fcall));
+			goto FreeMem;
+		}
+	}
+
+	/* need to update dcache so we show up */
+	kfree(fcall);
+
+	if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) {
+		dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n",
+			FCALL_ERROR(fcall));
+		goto FreeMem;
+	}
+
+	d_drop(dentry);		/* FID - will this also clunk? */
+
+      FreeMem:
+	kfree(mistat);
+	kfree(fcall);
+	putname(symname);
+
+	return retval;
+}
+
+static struct inode_operations v9fs_dir_inode_operations_ext = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.readlink = v9fs_vfs_readlink,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static struct inode_operations v9fs_dir_inode_operations = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static struct inode_operations v9fs_file_inode_operations = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static struct inode_operations v9fs_symlink_inode_operations = {
+	.readlink = v9fs_vfs_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
new file mode 100644
index 00000000000..868f350b2c5
--- /dev/null
+++ b/fs/9p/vfs_super.c
@@ -0,0 +1,280 @@
+/*
+ *  linux/fs/9p/vfs_super.c
+ *
+ * This file contians superblock ops for 9P2000. It is intended that
+ * you mount this file system on directories.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/idr.h>
+
+#include "debug.h"
+#include "v9fs.h"
+#include "9p.h"
+#include "v9fs_vfs.h"
+#include "conv.h"
+#include "fid.h"
+
+static void v9fs_clear_inode(struct inode *);
+static struct super_operations v9fs_super_ops;
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+
+static void v9fs_clear_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+}
+
+/**
+ * v9fs_set_super - set the superblock
+ * @s: super block
+ * @data: file system specific data
+ *
+ */
+
+static int v9fs_set_super(struct super_block *s, void *data)
+{
+	s->s_fs_info = data;
+	return set_anon_super(s, data);
+}
+
+/**
+ * v9fs_fill_super - populate superblock with info
+ * @sb: superblock
+ * @v9ses: session information
+ *
+ */
+
+static void
+v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
+		int flags)
+{
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
+	sb->s_blocksize = 1 << sb->s_blocksize_bits;
+	sb->s_magic = V9FS_MAGIC;
+	sb->s_op = &v9fs_super_ops;
+
+	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
+	    MS_NODIRATIME | MS_NOATIME;
+}
+
+/**
+ * v9fs_get_sb - mount a superblock
+ * @fs_type: file system type
+ * @flags: mount flags
+ * @dev_name: device name that was mounted
+ * @data: mount options
+ *
+ */
+
+static struct super_block *v9fs_get_sb(struct file_system_type
+				       *fs_type, int flags,
+				       const char *dev_name, void *data)
+{
+	struct super_block *sb = NULL;
+	struct v9fs_fcall *fcall = NULL;
+	struct inode *inode = NULL;
+	struct dentry *root = NULL;
+	struct v9fs_session_info *v9ses = NULL;
+	struct v9fs_fid *root_fid = NULL;
+	int mode = S_IRWXUGO | S_ISVTX;
+	uid_t uid = current->fsuid;
+	gid_t gid = current->fsgid;
+	int stat_result = 0;
+	int newfid = 0;
+	int retval = 0;
+
+	dprintk(DEBUG_VFS, " \n");
+
+	v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL);
+	if (!v9ses)
+		return ERR_PTR(-ENOMEM);
+
+	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
+		dprintk(DEBUG_ERROR, "problem initiating session\n");
+		retval = newfid;
+		goto free_session;
+	}
+
+	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
+
+	v9fs_fill_super(sb, v9ses, flags);
+
+	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		retval = PTR_ERR(inode);
+		goto put_back_sb;
+	}
+
+	inode->i_uid = uid;
+	inode->i_gid = gid;
+
+	root = d_alloc_root(inode);
+
+	if (!root) {
+		retval = -ENOMEM;
+		goto release_inode;
+	}
+
+	sb->s_root = root;
+
+	/* Setup the Root Inode */
+	root_fid = v9fs_fid_create(root);
+	if (root_fid == NULL) {
+		retval = -ENOMEM;
+		goto release_dentry;
+	}
+
+	root_fid->fidopen = 0;
+	root_fid->v9ses = v9ses;
+
+	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
+	if (stat_result < 0) {
+		dprintk(DEBUG_ERROR, "stat error\n");
+		v9fs_t_clunk(v9ses, newfid, NULL);
+		v9fs_put_idpool(newfid, &v9ses->fidpool);
+	} else {
+		root_fid->fid = newfid;
+		root_fid->qid = fcall->params.rstat.stat->qid;
+		root->d_inode->i_ino =
+		    v9fs_qid2ino(&fcall->params.rstat.stat->qid);
+		v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb);
+	}
+
+	kfree(fcall);
+
+	if (stat_result < 0) {
+		retval = stat_result;
+		goto release_dentry;
+	}
+
+	return sb;
+
+      release_dentry:
+	dput(sb->s_root);
+
+      release_inode:
+	iput(inode);
+
+      put_back_sb:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	v9fs_session_close(v9ses);
+
+      free_session:
+	kfree(v9ses);
+
+	return ERR_PTR(retval);
+}
+
+/**
+ * v9fs_kill_super - Kill Superblock
+ * @s: superblock
+ *
+ */
+
+static void v9fs_kill_super(struct super_block *s)
+{
+	struct v9fs_session_info *v9ses = s->s_fs_info;
+
+	dprintk(DEBUG_VFS, " %p\n", s);
+
+	v9fs_dentry_release(s->s_root);	/* clunk root */
+
+	kill_anon_super(s);
+
+	v9fs_session_close(v9ses);
+	kfree(v9ses);
+	dprintk(DEBUG_VFS, "exiting kill_super\n");
+}
+
+/**
+ * v9fs_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ *
+ */
+
+static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
+
+	if (v9ses->debug != 0)
+		seq_printf(m, ",debug=%u", v9ses->debug);
+	if (v9ses->port != V9FS_PORT)
+		seq_printf(m, ",port=%u", v9ses->port);
+	if (v9ses->maxdata != 9000)
+		seq_printf(m, ",msize=%u", v9ses->maxdata);
+	if (v9ses->afid != ~0)
+		seq_printf(m, ",afid=%u", v9ses->afid);
+	if (v9ses->proto == PROTO_UNIX)
+		seq_puts(m, ",proto=unix");
+	if (v9ses->extended == 0)
+		seq_puts(m, ",noextend");
+	if (v9ses->nodev == 1)
+		seq_puts(m, ",nodevmap");
+	seq_printf(m, ",name=%s", v9ses->name);
+	seq_printf(m, ",aname=%s", v9ses->remotename);
+	seq_printf(m, ",uid=%u", v9ses->uid);
+	seq_printf(m, ",gid=%u", v9ses->gid);
+	return 0;
+}
+
+static void
+v9fs_umount_begin(struct super_block *sb)
+{
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	v9fs_session_cancel(v9ses);
+}
+
+static struct super_operations v9fs_super_ops = {
+	.statfs = simple_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = v9fs_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
+struct file_system_type v9fs_fs_type = {
+	.name = "9P",
+	.get_sb = v9fs_get_sb,
+	.kill_sb = v9fs_kill_super,
+	.owner = THIS_MODULE,
+};
diff --git a/fs/Kconfig b/fs/Kconfig
index e54be705835..068ccea2f18 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -382,10 +382,8 @@ config QUOTA
 	  usage (also called disk quotas). Currently, it works for the
 	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
 	  quotas for which you don't need to run quotacheck(8) after an unclean
-	  shutdown. You need additional software in order to use quota support
-	  (you can download sources from
-	  <http://www.sf.net/projects/linuxquota/>). For further details, read
-	  the Quota mini-HOWTO, available from
+	  shutdown.
+	  For further details, read the Quota mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
 	  with the quota tools. Probably the quota support is only useful for
 	  multi user systems. If unsure, say N.
@@ -403,8 +401,7 @@ config QFMT_V2
 	depends on QUOTA
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here. Note that you will need recent
-	  quota utilities (>= 3.01) for new quota format with this kernel.
+	  need this functionality say Y here.
 
 config QUOTACTL
 	bool
@@ -465,6 +462,19 @@ config AUTOFS4_FS
 	  local network, you probably do not need an automounter, and can say
 	  N here.
 
+config FUSE_FS
+	tristate "Filesystem in Userspace support"
+	help
+	  With FUSE it is possible to implement a fully functional filesystem
+	  in a userspace program.
+
+	  There's also companion library: libfuse.  This library along with
+	  utilities is available from the FUSE homepage:
+	  <http://fuse.sourceforge.net/>
+
+	  If you want to develop a userspace FS, or if you want to use
+	  a filesystem based on FUSE, answer Y or M.
+
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
@@ -783,28 +793,6 @@ config SYSFS
 
 	Designers of embedded systems may wish to say N here to conserve space.
 
-config DEVPTS_FS_XATTR
-	bool "/dev/pts Extended Attributes"
-	depends on UNIX98_PTYS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config DEVPTS_FS_SECURITY
-	bool "/dev/pts Security Labels"
-	depends on DEVPTS_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the /dev/pts filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
 	help
@@ -817,27 +805,6 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
-config TMPFS_XATTR
-	bool "tmpfs Extended Attributes"
-	depends on TMPFS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config TMPFS_SECURITY
-	bool "tmpfs Security Labels"
-	depends on TMPFS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the tmpfs filesystem.
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
@@ -859,6 +826,18 @@ config RAMFS
 	  To compile this as a module, choose M here: the module will be called
 	  ramfs.
 
+config RELAYFS_FS
+	tristate "Relayfs file system support"
+	---help---
+	  Relayfs is a high-speed data relay filesystem designed to provide
+	  an efficient mechanism for tools and facilities to relay large
+	  amounts of data from kernel space to user space.
+
+	  To compile this code as a module, choose M here: the module will be
+	  called relayfs.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"
@@ -1737,6 +1716,17 @@ config AFS_FS
 config RXRPC
 	tristate
 
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+	depends on INET && EXPERIMENTAL
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Partition Types"
diff --git a/fs/Makefile b/fs/Makefile
index cf95eb894fd..1972da18627 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -89,10 +89,13 @@ obj-$(CONFIG_QNX4FS_FS)		+= qnx4/
 obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
+obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
+obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
+obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 7aa6f200453..9ebe881c678 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -255,6 +255,7 @@ void
 affs_delete_inode(struct inode *inode)
 {
 	pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	if (S_ISREG(inode->i_mode))
 		affs_truncate(inode);
diff --git a/fs/aio.c b/fs/aio.c
index 06d7d4390fe..38f62680fd6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/rcuref.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -499,7 +500,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	/* Must be done under the lock to serialise against cancellation.
 	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -546,6 +547,24 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ioctx;
 }
 
+static int lock_kiocb_action(void *param)
+{
+	schedule();
+	return 0;
+}
+
+static inline void lock_kiocb(struct kiocb *iocb)
+{
+	wait_on_bit_lock(&iocb->ki_flags, KIF_LOCKED, lock_kiocb_action,
+			 TASK_UNINTERRUPTIBLE);
+}
+
+static inline void unlock_kiocb(struct kiocb *iocb)
+{
+	kiocbClearLocked(iocb);
+	wake_up_bit(&iocb->ki_flags, KIF_LOCKED);
+}
+
 /*
  * use_mm
  *	Makes the calling kernel thread take on the specified
@@ -567,6 +586,10 @@ static void use_mm(struct mm_struct *mm)
 	atomic_inc(&mm->mm_count);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
+	/*
+	 * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
+	 * it won't work. Update it accordingly if you change it here
+	 */
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 
@@ -782,7 +805,9 @@ static int __aio_run_iocbs(struct kioctx *ctx)
 		 * Hold an extra reference while retrying i/o.
 		 */
 		iocb->ki_users++;       /* grab extra reference */
+		lock_kiocb(iocb);
 		aio_run_iocb(iocb);
+		unlock_kiocb(iocb);
 		if (__aio_put_req(ctx, iocb))  /* drop extra ref */
 			put_ioctx(ctx);
  	}
@@ -1523,10 +1548,9 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	if (likely(list_empty(&ctx->run_list))) {
-		aio_run_iocb(req);
-	} else {
-		list_add_tail(&req->ki_run_list, &ctx->run_list);
+	aio_run_iocb(req);
+	unlock_kiocb(req);
+	if (!list_empty(&ctx->run_list)) {
 		/* drain the run list */
 		while (__aio_run_iocbs(ctx))
 			;
@@ -1657,6 +1681,7 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 	if (NULL != cancel) {
 		struct io_event tmp;
 		pr_debug("calling cancel\n");
+		lock_kiocb(kiocb);
 		memset(&tmp, 0, sizeof(tmp));
 		tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
 		tmp.data = kiocb->ki_user_data;
@@ -1668,8 +1693,9 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 			if (copy_to_user(result, &tmp, sizeof(tmp)))
 				ret = -EFAULT;
 		}
+		unlock_kiocb(kiocb);
 	} else
-		printk(KERN_DEBUG "iocb has no cancel operation\n");
+		ret = -EINVAL;
 
 	put_ioctx(ctx);
 
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 6171431272d..990c28da5ae 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_sb_info {
 	struct file *pipe;
 	pid_t oz_pgrp;
 	int catatonic;
+	struct super_block *sb;
 	unsigned long exp_timeout;
 	ino_t next_dir_ino;
 	struct autofs_wait_queue *queues; /* Wait queue pointer */
@@ -134,7 +135,7 @@ void autofs_hash_insert(struct autofs_dirhash *,struct autofs_dir_ent *);
 void autofs_hash_delete(struct autofs_dir_ent *);
 struct autofs_dir_ent *autofs_hash_enum(const struct autofs_dirhash *,off_t *,struct autofs_dir_ent *);
 void autofs_hash_dputall(struct autofs_dirhash *);
-void autofs_hash_nuke(struct autofs_dirhash *);
+void autofs_hash_nuke(struct autofs_sb_info *);
 
 /* Expiration-handling functions */
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 448143fd079..5ccfcf26310 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -232,13 +232,13 @@ void autofs_hash_dputall(struct autofs_dirhash *dh)
 
 /* Delete everything.  This is used on filesystem destruction, so we
    make no attempt to keep the pointers valid */
-void autofs_hash_nuke(struct autofs_dirhash *dh)
+void autofs_hash_nuke(struct autofs_sb_info *sbi)
 {
 	int i;
 	struct autofs_dir_ent *ent, *nent;
 
 	for ( i = 0 ; i < AUTOFS_HASH_SIZE ; i++ ) {
-		for ( ent = dh->h[i] ; ent ; ent = nent ) {
+		for ( ent = sbi->dirhash.h[i] ; ent ; ent = nent ) {
 			nent = ent->next;
 			if ( ent->dentry )
 				dput(ent->dentry);
@@ -246,4 +246,5 @@ void autofs_hash_nuke(struct autofs_dirhash *dh)
 			kfree(ent);
 		}
 	}
+	shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 4888c1fabbf..65e5ed42190 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -27,7 +27,7 @@ static void autofs_put_super(struct super_block *sb)
 	if ( !sbi->catatonic )
 		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
 
-	autofs_hash_nuke(&sbi->dirhash);
+	autofs_hash_nuke(sbi);
 	for ( n = 0 ; n < AUTOFS_MAX_SYMLINKS ; n++ ) {
 		if ( test_bit(n, sbi->symlink_bitmap) )
 			kfree(sbi->symlink[n].data);
@@ -148,6 +148,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs_sops;
 	s->s_time_gran = 1;
+	sbi->sb = s;
 
 	root_inode = iget(s, AUTOFS_ROOT_INO);
 	root = d_alloc_root(root_inode);
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 1020dbc88be..1fbc53f14ab 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -20,7 +20,6 @@ struct bfs_sb_info {
 	unsigned long si_lasti;
 	unsigned long * si_imap;
 	struct buffer_head * si_sbh;		/* buffer header w/superblock */
-	struct bfs_super_block * si_bfs_sb;	/* superblock in si_sbh->b_data */
 };
 
 /*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 5a1e5ce057f..e240c335eb2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -2,6 +2,7 @@
  *	fs/bfs/dir.c
  *	BFS directory operations.
  *	Copyright (C) 1999,2000  Tigran Aivazian <tigran@veritas.com>
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005
  */
 
 #include <linux/time.h>
@@ -20,9 +21,9 @@
 #define dprintf(x...)
 #endif
 
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino);
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino);
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const char * name, int namelen, struct bfs_dirent ** res_dir);
+	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir);
 
 static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 {
@@ -53,7 +54,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos, de->ino, DT_UNKNOWN) < 0) {
+				if (filldir(dirent, de->name, size, f->f_pos, le16_to_cpu(de->ino), DT_UNKNOWN) < 0) {
 					brelse(bh);
 					unlock_kernel();
 					return 0;
@@ -107,7 +108,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	inode->i_mapping->a_ops = &bfs_aops;
 	inode->i_mode = mode;
 	inode->i_ino = ino;
-	BFS_I(inode)->i_dsk_ino = ino;
+	BFS_I(inode)->i_dsk_ino = cpu_to_le16(ino);
 	BFS_I(inode)->i_sblock = 0;
 	BFS_I(inode)->i_eblock = 0;
 	insert_inode_hash(inode);
@@ -139,7 +140,7 @@ static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, st
 	lock_kernel();
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 	if (bh) {
-		unsigned long ino = le32_to_cpu(de->ino);
+		unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
 		brelse(bh);
 		inode = iget(dir->i_sb, ino);
 		if (!inode) {
@@ -183,7 +184,7 @@ static int bfs_unlink(struct inode * dir, struct dentry * dentry)
 	inode = dentry->d_inode;
 	lock_kernel();
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
-	if (!bh || de->ino != inode->i_ino) 
+	if (!bh || le16_to_cpu(de->ino) != inode->i_ino)
 		goto out_brelse;
 
 	if (!inode->i_nlink) {
@@ -224,7 +225,7 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 				old_dentry->d_name.name, 
 				old_dentry->d_name.len, &old_de);
 
-	if (!old_bh || old_de->ino != old_inode->i_ino)
+	if (!old_bh || le16_to_cpu(old_de->ino) != old_inode->i_ino)
 		goto end_rename;
 
 	error = -EPERM;
@@ -270,7 +271,7 @@ struct inode_operations bfs_dir_inops = {
 	.rename			= bfs_rename,
 };
 
-static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int ino)
+static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino)
 {
 	struct buffer_head * bh;
 	struct bfs_dirent * de;
@@ -304,7 +305,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
 				}
 				dir->i_mtime = CURRENT_TIME_SEC;
 				mark_inode_dirty(dir);
-				de->ino = ino;
+				de->ino = cpu_to_le16((u16)ino);
 				for (i=0; i<BFS_NAMELEN; i++)
 					de->name[i] = (i < namelen) ? name[i] : 0;
 				mark_buffer_dirty(bh);
@@ -317,7 +318,7 @@ static int bfs_add_entry(struct inode * dir, const char * name, int namelen, int
 	return -ENOSPC;
 }
 
-static inline int bfs_namecmp(int len, const char * name, const char * buffer)
+static inline int bfs_namecmp(int len, const unsigned char * name, const char * buffer)
 {
 	if (len < BFS_NAMELEN && buffer[len])
 		return 0;
@@ -325,7 +326,7 @@ static inline int bfs_namecmp(int len, const char * name, const char * buffer)
 }
 
 static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const char * name, int namelen, struct bfs_dirent ** res_dir)
+	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir)
 {
 	unsigned long block, offset;
 	struct buffer_head * bh;
@@ -346,7 +347,7 @@ static struct buffer_head * bfs_find_entry(struct inode * dir,
 		}
 		de = (struct bfs_dirent *)(bh->b_data + offset);
 		offset += BFS_DIRENT_SIZE;
-		if (de->ino && bfs_namecmp(namelen, name, de->name)) {
+		if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) {
 			*res_dir = de;
 			return bh;
 		}
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 747fd1ea55e..807723b65da 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -40,8 +40,8 @@ static int bfs_move_block(unsigned long from, unsigned long to, struct super_blo
 	return 0;
 }
 
-static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned long end, 
-				unsigned long where)
+static int bfs_move_blocks(struct super_block *sb, unsigned long start,
+                           unsigned long end, unsigned long where)
 {
 	unsigned long i;
 
@@ -57,20 +57,21 @@ static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned
 static int bfs_get_block(struct inode * inode, sector_t block, 
 	struct buffer_head * bh_result, int create)
 {
-	long phys;
+	unsigned long phys;
 	int err;
 	struct super_block *sb = inode->i_sb;
 	struct bfs_sb_info *info = BFS_SB(sb);
 	struct bfs_inode_info *bi = BFS_I(inode);
 	struct buffer_head *sbh = info->si_sbh;
 
-	if (block < 0 || block > info->si_blocks)
+	if (block > info->si_blocks)
 		return -EIO;
 
 	phys = bi->i_sblock + block;
 	if (!create) {
 		if (phys <= bi->i_eblock) {
-			dprintf("c=%d, b=%08lx, phys=%08lx (granted)\n", create, block, phys);
+			dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n",
+                                create, (unsigned long)block, phys);
 			map_bh(bh_result, sb, phys);
 		}
 		return 0;
@@ -80,7 +81,7 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	   of blocks allocated for this file, we can grant it */
 	if (inode->i_size && phys <= bi->i_eblock) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
-				create, block, phys);
+				create, (unsigned long)block, phys);
 		map_bh(bh_result, sb, phys);
 		return 0;
 	}
@@ -88,11 +89,12 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	/* the rest has to be protected against itself */
 	lock_kernel();
 
-	/* if the last data block for this file is the last allocated block, we can
-	   extend the file trivially, without moving it anywhere */
+	/* if the last data block for this file is the last allocated
+	   block, we can extend the file trivially, without moving it
+	   anywhere */
 	if (bi->i_eblock == info->si_lf_eblk) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
-				create, block, phys);
+				create, (unsigned long)block, phys);
 		map_bh(bh_result, sb, phys);
 		info->si_freeb -= phys - bi->i_eblock;
 		info->si_lf_eblk = bi->i_eblock = phys;
@@ -114,7 +116,8 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	} else
 		err = 0;
 
-	dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", create, block, phys);
+	dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n",
+                create, (unsigned long)block, phys);
 	bi->i_sblock = phys;
 	phys += block;
 	info->si_lf_eblk = bi->i_eblock = phys;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 64e0fb33fc0..c7b39aa279d 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -3,6 +3,8 @@
  *	BFS superblock and inode operations.
  *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
  *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
+ *
+ *      Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
  */
 
 #include <linux/module.h>
@@ -54,46 +56,50 @@ static void bfs_read_inode(struct inode * inode)
 	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
-	inode->i_mode = 0x0000FFFF & di->i_mode;
-	if (di->i_vtype == BFS_VDIR) {
+	inode->i_mode = 0x0000FFFF &  le32_to_cpu(di->i_mode);
+	if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
 		inode->i_mode |= S_IFDIR;
 		inode->i_op = &bfs_dir_inops;
 		inode->i_fop = &bfs_dir_operations;
-	} else if (di->i_vtype == BFS_VREG) {
+	} else if (le32_to_cpu(di->i_vtype) == BFS_VREG) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &bfs_file_inops;
 		inode->i_fop = &bfs_file_operations;
 		inode->i_mapping->a_ops = &bfs_aops;
 	}
 
-	inode->i_uid = di->i_uid;
-	inode->i_gid = di->i_gid;
-	inode->i_nlink = di->i_nlink;
+	BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
+	BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
+	inode->i_uid =  le32_to_cpu(di->i_uid);
+	inode->i_gid =  le32_to_cpu(di->i_gid);
+	inode->i_nlink =  le32_to_cpu(di->i_nlink);
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
+        if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
 	inode->i_blksize = PAGE_SIZE;
-	inode->i_atime.tv_sec = di->i_atime;
-	inode->i_mtime.tv_sec = di->i_mtime;
-	inode->i_ctime.tv_sec = di->i_ctime;
+	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
+	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
+	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
 	inode->i_atime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	BFS_I(inode)->i_dsk_ino = di->i_ino; /* can be 0 so we store a copy */
-	BFS_I(inode)->i_sblock = di->i_sblock;
-	BFS_I(inode)->i_eblock = di->i_eblock;
+	BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); /* can be 0 so we store a copy */
 
 	brelse(bh);
 }
 
 static int bfs_write_inode(struct inode * inode, int unused)
 {
-	unsigned long ino = inode->i_ino;
+	unsigned int ino = (u16)inode->i_ino;
+        unsigned long i_sblock;
 	struct bfs_inode * di;
 	struct buffer_head * bh;
 	int block, off;
 
+        dprintf("ino=%08x\n", ino);
+
 	if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
-		printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
 		return -EIO;
 	}
 
@@ -101,7 +107,7 @@ static int bfs_write_inode(struct inode * inode, int unused)
 	block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino);
 		unlock_kernel();
 		return -EIO;
 	}
@@ -109,24 +115,26 @@ static int bfs_write_inode(struct inode * inode, int unused)
 	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
-	if (inode->i_ino == BFS_ROOT_INO)
-		di->i_vtype = BFS_VDIR;
+	if (ino == BFS_ROOT_INO)
+		di->i_vtype = cpu_to_le32(BFS_VDIR);
 	else
-		di->i_vtype = BFS_VREG;
-
-	di->i_ino = inode->i_ino;
-	di->i_mode = inode->i_mode;
-	di->i_uid = inode->i_uid;
-	di->i_gid = inode->i_gid;
-	di->i_nlink = inode->i_nlink;
-	di->i_atime = inode->i_atime.tv_sec;
-	di->i_mtime = inode->i_mtime.tv_sec;
-	di->i_ctime = inode->i_ctime.tv_sec;
-	di->i_sblock = BFS_I(inode)->i_sblock;
-	di->i_eblock = BFS_I(inode)->i_eblock;
-	di->i_eoffset = di->i_sblock * BFS_BSIZE + inode->i_size - 1;
+		di->i_vtype = cpu_to_le32(BFS_VREG);
+
+	di->i_ino = cpu_to_le16(ino);
+	di->i_mode = cpu_to_le32(inode->i_mode);
+	di->i_uid = cpu_to_le32(inode->i_uid);
+	di->i_gid = cpu_to_le32(inode->i_gid);
+	di->i_nlink = cpu_to_le32(inode->i_nlink);
+	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+        i_sblock = BFS_I(inode)->i_sblock;
+	di->i_sblock = cpu_to_le32(i_sblock);
+	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
+	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
+        dprintf("Written ino=%d into %d:%d\n",le16_to_cpu(di->i_ino),block,off);
 	brelse(bh);
 	unlock_kernel();
 	return 0;
@@ -140,11 +148,14 @@ static void bfs_delete_inode(struct inode * inode)
 	int block, off;
 	struct super_block * s = inode->i_sb;
 	struct bfs_sb_info * info = BFS_SB(s);
+	struct bfs_inode_info * bi = BFS_I(inode);
 
-	dprintf("ino=%08lx\n", inode->i_ino);
+	dprintf("ino=%08lx\n", ino);
 
-	if (inode->i_ino < BFS_ROOT_INO || inode->i_ino > info->si_lasti) {
-		printf("invalid ino=%08lx\n", inode->i_ino);
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (ino < BFS_ROOT_INO || ino > info->si_lasti) {
+		printf("invalid ino=%08lx\n", ino);
 		return;
 	}
 	
@@ -160,13 +171,13 @@ static void bfs_delete_inode(struct inode * inode)
 		return;
 	}
 	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *)bh->b_data + off;
-	if (di->i_ino) {
-		info->si_freeb += BFS_FILEBLOCKS(di);
+	di = (struct bfs_inode *) bh->b_data + off;
+        if (bi->i_dsk_ino) {
+		info->si_freeb += 1 + bi->i_eblock - bi->i_sblock;
 		info->si_freei++;
-		clear_bit(di->i_ino, info->si_imap);
+		clear_bit(ino, info->si_imap);
 		dump_imap("delete_inode", s);
-	}
+        }
 	di->i_ino = 0;
 	di->i_sblock = 0;
 	mark_buffer_dirty(bh);
@@ -272,14 +283,14 @@ static struct super_operations bfs_sops = {
 
 void dump_imap(const char *prefix, struct super_block * s)
 {
-#if 0
+#ifdef DEBUG
 	int i;
 	char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL);
 
 	if (!tmpbuf)
 		return;
 	for (i=BFS_SB(s)->si_lasti; i>=0; i--) {
-		if (i>PAGE_SIZE-100) break;
+		if (i > PAGE_SIZE-100) break;
 		if (test_bit(i, BFS_SB(s)->si_imap))
 			strcat(tmpbuf, "1");
 		else
@@ -295,7 +306,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	struct buffer_head * bh;
 	struct bfs_super_block * bfs_sb;
 	struct inode * inode;
-	int i, imap_len;
+	unsigned i, imap_len;
 	struct bfs_sb_info * info;
 
 	info = kmalloc(sizeof(*info), GFP_KERNEL);
@@ -310,19 +321,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	if(!bh)
 		goto out;
 	bfs_sb = (struct bfs_super_block *)bh->b_data;
-	if (bfs_sb->s_magic != BFS_MAGIC) {
+	if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
 		if (!silent)
 			printf("No BFS filesystem on %s (magic=%08x)\n", 
-				s->s_id, bfs_sb->s_magic);
+				s->s_id,  le32_to_cpu(bfs_sb->s_magic));
 		goto out;
 	}
 	if (BFS_UNCLEAN(bfs_sb, s) && !silent)
 		printf("%s is unclean, continuing\n", s->s_id);
 
 	s->s_magic = BFS_MAGIC;
-	info->si_bfs_sb = bfs_sb;
 	info->si_sbh = bh;
-	info->si_lasti = (bfs_sb->s_start - BFS_BSIZE)/sizeof(struct bfs_inode) 
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE)/sizeof(struct bfs_inode)
 			+ BFS_ROOT_INO - 1;
 
 	imap_len = info->si_lasti/8 + 1;
@@ -346,8 +356,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		goto out;
 	}
 
-	info->si_blocks = (bfs_sb->s_end + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
-	info->si_freeb = (bfs_sb->s_end + 1 - bfs_sb->s_start)>>BFS_BSIZE_BITS;
+	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  cpu_to_le32(bfs_sb->s_start))>>BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
 	info->si_lf_sblk = 0;
diff --git a/fs/bio.c b/fs/bio.c
index 1f2d4649b18..83a34957456 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
 
@@ -104,18 +105,22 @@ static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int
 	return bvl;
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_destructor(struct bio *bio)
+void bio_free(struct bio *bio, struct bio_set *bio_set)
 {
 	const int pool_idx = BIO_POOL_IDX(bio);
-	struct bio_set *bs = bio->bi_set;
 
 	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
 
-	mempool_free(bio->bi_io_vec, bs->bvec_pools[pool_idx]);
-	mempool_free(bio, bs->bio_pool);
+	mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
+	mempool_free(bio, bio_set->bio_pool);
+}
+
+/*
+ * default destructor for a bio allocated with bio_alloc_bioset()
+ */
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
 }
 
 inline void bio_init(struct bio *bio)
@@ -171,8 +176,6 @@ struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, stru
 			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
 		}
 		bio->bi_io_vec = bvl;
-		bio->bi_destructor = bio_destructor;
-		bio->bi_set = bs;
 	}
 out:
 	return bio;
@@ -180,7 +183,12 @@ out:
 
 struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
 {
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+
+	if (bio)
+		bio->bi_destructor = bio_fs_destructor;
+
+	return bio;
 }
 
 void zero_fill_bio(struct bio *bio)
@@ -273,8 +281,10 @@ struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
 {
 	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
 
-	if (b)
+	if (b) {
+		b->bi_destructor = bio_fs_destructor;
 		__bio_clone(b, bio);
+	}
 
 	return b;
 }
@@ -546,22 +556,34 @@ out_bmd:
 	return ERR_PTR(ret);
 }
 
-static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
-				  unsigned long uaddr, unsigned int len,
-				  int write_to_vm)
+static struct bio *__bio_map_user_iov(request_queue_t *q,
+				      struct block_device *bdev,
+				      struct sg_iovec *iov, int iov_count,
+				      int write_to_vm)
 {
-	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = uaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
-	int ret, offset, i;
+	int i, j;
+	int nr_pages = 0;
 	struct page **pages;
 	struct bio *bio;
+	int cur_page = 0;
+	int ret, offset;
 
-	/*
-	 * transfer and buffer must be aligned to at least hardsector
-	 * size for now, in the future we can relax this restriction
-	 */
-	if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+	for (i = 0; i < iov_count; i++) {
+		unsigned long uaddr = (unsigned long)iov[i].iov_base;
+		unsigned long len = iov[i].iov_len;
+		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long start = uaddr >> PAGE_SHIFT;
+
+		nr_pages += end - start;
+		/*
+		 * transfer and buffer must be aligned to at least hardsector
+		 * size for now, in the future we can relax this restriction
+		 */
+		if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+			return ERR_PTR(-EINVAL);
+	}
+
+	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
 	bio = bio_alloc(GFP_KERNEL, nr_pages);
@@ -573,42 +595,54 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
 	if (!pages)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, nr_pages,
-						write_to_vm, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < nr_pages)
-		goto out;
-
-	bio->bi_bdev = bdev;
-
-	offset = uaddr & ~PAGE_MASK;
-	for (i = 0; i < nr_pages; i++) {
-		unsigned int bytes = PAGE_SIZE - offset;
-
-		if (len <= 0)
-			break;
-
-		if (bytes > len)
-			bytes = len;
+	memset(pages, 0, nr_pages * sizeof(struct page *));
+
+	for (i = 0; i < iov_count; i++) {
+		unsigned long uaddr = (unsigned long)iov[i].iov_base;
+		unsigned long len = iov[i].iov_len;
+		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long start = uaddr >> PAGE_SHIFT;
+		const int local_nr_pages = end - start;
+		const int page_limit = cur_page + local_nr_pages;
+		
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(current, current->mm, uaddr,
+				     local_nr_pages,
+				     write_to_vm, 0, &pages[cur_page], NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret < local_nr_pages)
+			goto out_unmap;
+
+
+		offset = uaddr & ~PAGE_MASK;
+		for (j = cur_page; j < page_limit; j++) {
+			unsigned int bytes = PAGE_SIZE - offset;
+
+			if (len <= 0)
+				break;
+			
+			if (bytes > len)
+				bytes = len;
+
+			/*
+			 * sorry...
+			 */
+			if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes)
+				break;
+
+			len -= bytes;
+			offset = 0;
+		}
 
+		cur_page = j;
 		/*
-		 * sorry...
+		 * release the pages we didn't map into the bio, if any
 		 */
-		if (__bio_add_page(q, bio, pages[i], bytes, offset) < bytes)
-			break;
-
-		len -= bytes;
-		offset = 0;
+		while (j < page_limit)
+			page_cache_release(pages[j++]);
 	}
 
-	/*
-	 * release the pages we didn't map into the bio, if any
-	 */
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
-
 	kfree(pages);
 
 	/*
@@ -617,9 +651,17 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
 	if (!write_to_vm)
 		bio->bi_rw |= (1 << BIO_RW);
 
+	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
 	return bio;
-out:
+
+ out_unmap:
+	for (i = 0; i < nr_pages; i++) {
+		if(!pages[i])
+			break;
+		page_cache_release(pages[i]);
+	}
+ out:
 	kfree(pages);
 	bio_put(bio);
 	return ERR_PTR(ret);
@@ -639,9 +681,33 @@ out:
 struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 			 unsigned long uaddr, unsigned int len, int write_to_vm)
 {
+	struct sg_iovec iov;
+
+	iov.iov_base = (void __user *)uaddr;
+	iov.iov_len = len;
+
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+}
+
+/**
+ *	bio_map_user_iov - map user sg_iovec table into bio
+ *	@q: the request_queue_t for the bio
+ *	@bdev: destination block device
+ *	@iov:	the iovec.
+ *	@iov_count: number of elements in the iovec
+ *	@write_to_vm: bool indicating writing to pages or not
+ *
+ *	Map the user space address into a bio suitable for io to a block
+ *	device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
+			     struct sg_iovec *iov, int iov_count,
+			     int write_to_vm)
+{
 	struct bio *bio;
+	int len = 0, i;
 
-	bio = __bio_map_user(q, bdev, uaddr, len, write_to_vm);
+	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
 
 	if (IS_ERR(bio))
 		return bio;
@@ -654,6 +720,9 @@ struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 	 */
 	bio_get(bio);
 
+	for (i = 0; i < iov_count; i++)
+		len += iov[i].iov_len;
+
 	if (bio->bi_size == len)
 		return bio;
 
@@ -698,6 +767,82 @@ void bio_unmap_user(struct bio *bio)
 	bio_put(bio);
 }
 
+static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bio_put(bio);
+	return 0;
+}
+
+
+static struct bio *__bio_map_kern(request_queue_t *q, void *data,
+				  unsigned int len, unsigned int gfp_mask)
+{
+	unsigned long kaddr = (unsigned long)data;
+	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned long start = kaddr >> PAGE_SHIFT;
+	const int nr_pages = end - start;
+	int offset, i;
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_mask, nr_pages);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+
+	offset = offset_in_page(kaddr);
+	for (i = 0; i < nr_pages; i++) {
+		unsigned int bytes = PAGE_SIZE - offset;
+
+		if (len <= 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		if (__bio_add_page(q, bio, virt_to_page(data), bytes,
+				   offset) < bytes)
+			break;
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+	}
+
+	bio->bi_end_io = bio_map_kern_endio;
+	return bio;
+}
+
+/**
+ *	bio_map_kern	-	map kernel address into bio
+ *	@q: the request_queue_t for the bio
+ *	@data: pointer to buffer to map
+ *	@len: length in bytes
+ *	@gfp_mask: allocation flags for bio allocation
+ *
+ *	Map the kernel address into a bio suitable for io to a block
+ *	device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
+			 unsigned int gfp_mask)
+{
+	struct bio *bio;
+
+	bio = __bio_map_kern(q, data, len, gfp_mask);
+	if (IS_ERR(bio))
+		return bio;
+
+	if (bio->bi_size == len)
+		return bio;
+
+	/*
+	 * Don't support partial mappings.
+	 */
+	bio_put(bio);
+	return ERR_PTR(-EINVAL);
+}
+
 /*
  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
  * for performing direct-IO in BIOs.
@@ -1075,6 +1220,7 @@ subsys_initcall(init_bio);
 
 EXPORT_SYMBOL(bio_alloc);
 EXPORT_SYMBOL(bio_put);
+EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
 EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
@@ -1085,6 +1231,7 @@ EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
 EXPORT_SYMBOL(bio_map_user);
 EXPORT_SYMBOL(bio_unmap_user);
+EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
 EXPORT_SYMBOL(bio_split_pool);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6a25d7df89b..1c62203a490 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -917,8 +917,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				wait_on_buffer(bh);
-				ll_rw_block(WRITE, 1, &bh);
+				ll_rw_block(SWRITE, 1, &bh);
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -2793,21 +2792,22 @@ int submit_bh(int rw, struct buffer_head * bh)
 
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads,
- * and requests an I/O operation on them, either a %READ or a %WRITE.
- * The third %READA option is described in the documentation for
- * generic_make_request() which ll_rw_block() calls.
+ * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
+ * requests an I/O operation on them, either a %READ or a %WRITE.  The third
+ * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
+ * are sent to disk. The fourth %READA option is described in the documentation
+ * for generic_make_request() which ll_rw_block() calls.
  *
  * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a
- * write request, and any buffer that appears to be up-to-date when doing
- * read request.  Further it marks as clean buffers that are processed for
- * writing (the buffer cache won't assume that they are actually clean until
- * the buffer gets unlocked).
+ * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
+ * clean when doing a write request, and any buffer that appears to be
+ * up-to-date when doing read request.  Further it marks as clean buffers that
+ * are processed for writing (the buffer cache won't assume that they are
+ * actually clean until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2823,11 +2823,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (test_set_buffer_locked(bh))
+		if (rw == SWRITE)
+			lock_buffer(bh);
+		else if (test_set_buffer_locked(bh))
 			continue;
 
 		get_bh(bh);
-		if (rw == WRITE) {
+		if (rw == WRITE || rw == SWRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				submit_bh(WRITE, bh);
@@ -3046,10 +3048,9 @@ struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
 {
 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
 	if (ret) {
-		preempt_disable();
-		__get_cpu_var(bh_accounting).nr++;
+		get_cpu_var(bh_accounting).nr++;
 		recalc_bh_state();
-		preempt_enable();
+		put_cpu_var(bh_accounting);
 	}
 	return ret;
 }
@@ -3059,10 +3060,9 @@ void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
 	kmem_cache_free(bh_cachep, bh);
-	preempt_disable();
-	__get_cpu_var(bh_accounting).nr--;
+	get_cpu_var(bh_accounting).nr--;
 	recalc_bh_state();
-	preempt_enable();
+	put_cpu_var(bh_accounting);
 }
 EXPORT_SYMBOL(free_buffer_head);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e568cc47a7f..3217ac5f6bd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -836,7 +836,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				/* go from value to value + temp_len condensing 
 				double commas to singles. Note that this ends up
 				allocating a few bytes too many, which is ok */
-				vol->password = kcalloc(1, temp_len, GFP_KERNEL);
+				vol->password = kzalloc(temp_len, GFP_KERNEL);
 				if(vol->password == NULL) {
 					printk("CIFS: no memory for pass\n");
 					return 1;
@@ -851,7 +851,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				}
 				vol->password[j] = 0;
 			} else {
-				vol->password = kcalloc(1, temp_len+1, GFP_KERNEL);
+				vol->password = kzalloc(temp_len+1, GFP_KERNEL);
 				if(vol->password == NULL) {
 					printk("CIFS: no memory for pass\n");
 					return 1;
@@ -1317,7 +1317,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 		sessinit is sent but no second negprot */
 		struct rfc1002_session_packet * ses_init_buf;
 		struct smb_hdr * smb_buf;
-		ses_init_buf = kcalloc(1, sizeof(struct rfc1002_session_packet), GFP_KERNEL);
+		ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL);
 		if(ses_init_buf) {
 			ses_init_buf->trailer.session_req.called_len = 32;
 			rfc1002mangle(ses_init_buf->trailer.session_req.called_name,
@@ -1964,7 +1964,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
-				ses->serverOS = kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+				ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL);
 				if(ses->serverOS == NULL)
 					goto sesssetup_nomem;
 				cifs_strfromUCS_le(ses->serverOS,
@@ -1976,7 +1976,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (remaining_words > 0) {
 					len = UniStrnlen((wchar_t *)bcc_ptr,
 							 remaining_words-1);
-					ses->serverNOS = kcalloc(1, 2 * (len + 1),GFP_KERNEL);
+					ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
 					cifs_strfromUCS_le(ses->serverNOS,
@@ -1994,7 +1994,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
           /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
 						ses->serverDomain =
-						    kcalloc(1, 2*(len+1),GFP_KERNEL);
+						    kzalloc(2*(len+1),GFP_KERNEL);
 						if(ses->serverDomain == NULL)
 							goto sesssetup_nomem;
 						cifs_strfromUCS_le(ses->serverDomain,
@@ -2005,22 +2005,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					} /* else no more room so create dummy domain string */
 					else
 						ses->serverDomain = 
-							kcalloc(1, 2, GFP_KERNEL);
+							kzalloc(2, GFP_KERNEL);
 				} else {	/* no room so create dummy domain and NOS string */
 					/* if these kcallocs fail not much we
 					   can do, but better to not fail the
 					   sesssetup itself */
 					ses->serverDomain =
-					    kcalloc(1, 2, GFP_KERNEL);
+					    kzalloc(2, GFP_KERNEL);
 					ses->serverNOS =
-					    kcalloc(1, 2, GFP_KERNEL);
+					    kzalloc(2, GFP_KERNEL);
 				}
 			} else {	/* ASCII */
 				len = strnlen(bcc_ptr, 1024);
 				if (((long) bcc_ptr + len) - (long)
 				    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
-					ses->serverOS = kcalloc(1, len + 1,GFP_KERNEL);
+					ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverOS == NULL)
 						goto sesssetup_nomem;
 					strncpy(ses->serverOS,bcc_ptr, len);
@@ -2030,7 +2030,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
-					ses->serverNOS = kcalloc(1, len + 1,GFP_KERNEL);
+					ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
 					strncpy(ses->serverNOS, bcc_ptr, len);
@@ -2039,7 +2039,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
-					ses->serverDomain = kcalloc(1, len + 1,GFP_KERNEL);
+					ses->serverDomain = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverDomain == NULL)
 						goto sesssetup_nomem;
 					strncpy(ses->serverDomain, bcc_ptr, len);
@@ -2240,7 +2240,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
 					ses->serverOS =
-					    kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
 							   (wchar_t *)
 							   bcc_ptr, len,
@@ -2254,7 +2254,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 								 remaining_words
 								 - 1);
 						ses->serverNOS =
-						    kcalloc(1, 2 * (len + 1),
+						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
 						cifs_strfromUCS_le(ses->serverNOS,
 								   (wchar_t *)bcc_ptr,
@@ -2267,7 +2267,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
                             /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
-							ses->serverDomain = kcalloc(1, 2*(len+1),GFP_KERNEL);
+							ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
 							cifs_strfromUCS_le(ses->serverDomain,
 							     (wchar_t *)bcc_ptr, 
                                  len,
@@ -2278,10 +2278,10 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						} /* else no more room so create dummy domain string */
 						else
 							ses->serverDomain =
-							    kcalloc(1, 2,GFP_KERNEL);
+							    kzalloc(2,GFP_KERNEL);
 					} else {	/* no room so create dummy domain and NOS string */
-						ses->serverDomain = kcalloc(1, 2, GFP_KERNEL);
-						ses->serverNOS = kcalloc(1, 2, GFP_KERNEL);
+						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
 
@@ -2289,7 +2289,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (((long) bcc_ptr + len) - (long)
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
-						ses->serverOS = kcalloc(1, len + 1, GFP_KERNEL);
+						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverOS, bcc_ptr, len);
 
 						bcc_ptr += len;
@@ -2297,14 +2297,14 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						ses->serverNOS = kcalloc(1, len + 1,GFP_KERNEL);
+						ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);
 						bcc_ptr += len;
 						bcc_ptr[0] = 0;
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						ses->serverDomain = kcalloc(1, len + 1, GFP_KERNEL);
+						ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
 						bcc_ptr[0] = 0;
@@ -2554,7 +2554,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
 					ses->serverOS =
-					    kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
 							   (wchar_t *)
 							   bcc_ptr, len,
@@ -2569,7 +2569,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								 remaining_words
 								 - 1);
 						ses->serverNOS =
-						    kcalloc(1, 2 * (len + 1),
+						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
 						cifs_strfromUCS_le(ses->
 								   serverNOS,
@@ -2586,7 +2586,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
 							ses->serverDomain =
-							    kcalloc(1, 2 *
+							    kzalloc(2 *
 								    (len +
 								     1),
 								    GFP_KERNEL);
@@ -2612,13 +2612,13 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						} /* else no more room so create dummy domain string */
 						else
 							ses->serverDomain =
-							    kcalloc(1, 2,
+							    kzalloc(2,
 								    GFP_KERNEL);
 					} else {	/* no room so create dummy domain and NOS string */
 						ses->serverDomain =
-						    kcalloc(1, 2, GFP_KERNEL);
+						    kzalloc(2, GFP_KERNEL);
 						ses->serverNOS =
-						    kcalloc(1, 2, GFP_KERNEL);
+						    kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
 					len = strnlen(bcc_ptr, 1024);
@@ -2626,7 +2626,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
 						ses->serverOS =
-						    kcalloc(1, len + 1,
+						    kzalloc(len + 1,
 							    GFP_KERNEL);
 						strncpy(ses->serverOS,
 							bcc_ptr, len);
@@ -2637,7 +2637,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 
 						len = strnlen(bcc_ptr, 1024);
 						ses->serverNOS =
-						    kcalloc(1, len + 1,
+						    kzalloc(len + 1,
 							    GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);
 						bcc_ptr += len;
@@ -2646,7 +2646,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 
 						len = strnlen(bcc_ptr, 1024);
 						ses->serverDomain =
-						    kcalloc(1, len + 1,
+						    kzalloc(len + 1,
 							    GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);	
 						bcc_ptr += len;
@@ -2948,7 +2948,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
 					ses->serverOS =
-					    kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
 							   (wchar_t *)
 							   bcc_ptr, len,
@@ -2963,7 +2963,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 								 remaining_words
 								 - 1);
 						ses->serverNOS =
-						    kcalloc(1, 2 * (len + 1),
+						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
 						cifs_strfromUCS_le(ses->
 								   serverNOS,
@@ -2979,7 +2979,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
      /* last string not always null terminated (e.g. for Windows XP & 2000) */
 							ses->serverDomain =
-							    kcalloc(1, 2 *
+							    kzalloc(2 *
 								    (len +
 								     1),
 								    GFP_KERNEL);
@@ -3004,17 +3004,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 							    = 0;
 						} /* else no more room so create dummy domain string */
 						else
-							ses->serverDomain = kcalloc(1, 2,GFP_KERNEL);
+							ses->serverDomain = kzalloc(2,GFP_KERNEL);
 					} else {  /* no room so create dummy domain and NOS string */
-						ses->serverDomain = kcalloc(1, 2, GFP_KERNEL);
-						ses->serverNOS = kcalloc(1, 2, GFP_KERNEL);
+						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
 					len = strnlen(bcc_ptr, 1024);
 					if (((long) bcc_ptr + len) - 
                         (long) pByteArea(smb_buffer_response) 
                             <= BCC(smb_buffer_response)) {
-						ses->serverOS = kcalloc(1, len + 1,GFP_KERNEL);
+						ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverOS,bcc_ptr, len);
 
 						bcc_ptr += len;
@@ -3022,14 +3022,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						ses->serverNOS = kcalloc(1, len+1,GFP_KERNEL);
+						ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);	
 						bcc_ptr += len;
 						bcc_ptr[0] = 0;
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						ses->serverDomain = kcalloc(1, len+1,GFP_KERNEL);
+						ses->serverDomain = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
 						bcc_ptr[0] = 0;
@@ -3141,7 +3141,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 				if(tcon->nativeFileSystem)
 					kfree(tcon->nativeFileSystem);
 				tcon->nativeFileSystem =
-				    kcalloc(1, length + 2, GFP_KERNEL);
+				    kzalloc(length + 2, GFP_KERNEL);
 				cifs_strfromUCS_le(tcon->nativeFileSystem,
 						   (wchar_t *) bcc_ptr,
 						   length, nls_codepage);
@@ -3159,7 +3159,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 				if(tcon->nativeFileSystem)
 					kfree(tcon->nativeFileSystem);
 				tcon->nativeFileSystem =
-				    kcalloc(1, length + 1, GFP_KERNEL);
+				    kzalloc(length + 1, GFP_KERNEL);
 				strncpy(tcon->nativeFileSystem, bcc_ptr,
 					length);
 			}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3f3538d4a1f..d335269bd91 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -145,24 +145,23 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	if(nd) {
-		if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY)
-			desiredAccess = GENERIC_READ;
-		else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) {
-			desiredAccess = GENERIC_WRITE;
-			write_only = TRUE;
-		} else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) {
-			/* GENERIC_ALL is too much permission to request */
-			/* can cause unnecessary access denied on create */
-			/* desiredAccess = GENERIC_ALL; */
-			desiredAccess = GENERIC_READ | GENERIC_WRITE;
+	if(nd && (nd->flags & LOOKUP_OPEN)) {
+		int oflags = nd->intent.open.flags;
+
+		desiredAccess = 0;
+		if (oflags & FMODE_READ)
+			desiredAccess |= GENERIC_READ;
+		if (oflags & FMODE_WRITE) {
+			desiredAccess |= GENERIC_WRITE;
+			if (!(oflags & FMODE_READ))
+				write_only = TRUE;
 		}
 
-		if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		if((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 			disposition = FILE_CREATE;
-		else if((nd->intent.open.flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+		else if((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
 			disposition = FILE_OVERWRITE_IF;
-		else if((nd->intent.open.flags & O_CREAT) == O_CREAT)
+		else if((oflags & O_CREAT) == O_CREAT)
 			disposition = FILE_OPEN_IF;
 		else {
 			cFYI(1,("Create flag not set in create function"));
diff --git a/fs/compat.c b/fs/compat.c
index 6b06b6bae35..ac3fb9ed8ee 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -310,96 +310,6 @@ static int __init init_sys32_ioctl(void)
 
 __initcall(init_sys32_ioctl);
 
-int register_ioctl32_conversion(unsigned int cmd,
-				ioctl_trans_handler_t handler)
-{
-	struct ioctl_trans *t;
-	struct ioctl_trans *new_t;
-	unsigned long hash = ioctl32_hash(cmd);
-
-	new_t = kmalloc(sizeof(*new_t), GFP_KERNEL);
-	if (!new_t)
-		return -ENOMEM;
-
-	down_write(&ioctl32_sem);
-	for (t = ioctl32_hash_table[hash]; t; t = t->next) {
-		if (t->cmd == cmd) {
-			printk(KERN_ERR "Trying to register duplicated ioctl32 "
-					"handler %x\n", cmd);
-			up_write(&ioctl32_sem);
-			kfree(new_t);
-			return -EINVAL; 
-		}
-	}
-	new_t->next = NULL;
-	new_t->cmd = cmd;
-	new_t->handler = handler;
-	ioctl32_insert_translation(new_t);
-
-	up_write(&ioctl32_sem);
-	return 0;
-}
-EXPORT_SYMBOL(register_ioctl32_conversion);
-
-static inline int builtin_ioctl(struct ioctl_trans *t)
-{ 
-	return t >= ioctl_start && t < (ioctl_start + ioctl_table_size);
-} 
-
-/* Problem: 
-   This function cannot unregister duplicate ioctls, because they are not
-   unique.
-   When they happen we need to extend the prototype to pass the handler too. */
-
-int unregister_ioctl32_conversion(unsigned int cmd)
-{
-	unsigned long hash = ioctl32_hash(cmd);
-	struct ioctl_trans *t, *t1;
-
-	down_write(&ioctl32_sem);
-
-	t = ioctl32_hash_table[hash];
-	if (!t) { 
-		up_write(&ioctl32_sem);
-		return -EINVAL;
-	} 
-
-	if (t->cmd == cmd) { 
-		if (builtin_ioctl(t)) {
-			printk("%p tried to unregister builtin ioctl %x\n",
-			       __builtin_return_address(0), cmd);
-		} else { 
-			ioctl32_hash_table[hash] = t->next;
-			up_write(&ioctl32_sem);
-			kfree(t);
-			return 0;
-		}
-	} 
-	while (t->next) {
-		t1 = t->next;
-		if (t1->cmd == cmd) { 
-			if (builtin_ioctl(t1)) {
-				printk("%p tried to unregister builtin "
-					"ioctl %x\n",
-					__builtin_return_address(0), cmd);
-				goto out;
-			} else { 
-				t->next = t1->next;
-				up_write(&ioctl32_sem);
-				kfree(t1);
-				return 0;
-			}
-		}
-		t = t1;
-	}
-	printk(KERN_ERR "Trying to free unknown 32bit ioctl handler %x\n",
-				cmd);
-out:
-	up_write(&ioctl32_sem);
-	return -EINVAL;
-}
-EXPORT_SYMBOL(unregister_ioctl32_conversion); 
-
 static void compat_ioctl_error(struct file *filp, unsigned int fd,
 		unsigned int cmd, unsigned long arg)
 {
@@ -720,14 +630,14 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
 struct compat_ncp_mount_data {
 	compat_int_t version;
 	compat_uint_t ncp_fd;
-	compat_uid_t mounted_uid;
+	__compat_uid_t mounted_uid;
 	compat_pid_t wdog_pid;
 	unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
 	compat_uint_t time_out;
 	compat_uint_t retry_count;
 	compat_uint_t flags;
-	compat_uid_t uid;
-	compat_gid_t gid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
 	compat_mode_t file_mode;
 	compat_mode_t dir_mode;
 };
@@ -784,9 +694,9 @@ static void *do_ncp_super_data_conv(void *raw_data)
 
 struct compat_smb_mount_data {
 	compat_int_t version;
-	compat_uid_t mounted_uid;
-	compat_uid_t uid;
-	compat_gid_t gid;
+	__compat_uid_t mounted_uid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
 	compat_mode_t file_mode;
 	compat_mode_t dir_mode;
 };
@@ -1365,6 +1275,16 @@ out:
 }
 
 /*
+ * Exactly like fs/open.c:sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open(const char __user *filename, int flags, int mode)
+{
+	return do_sys_open(filename, flags, mode);
+}
+
+/*
  * compat_count() counts the number of arguments/envelopes. It is basically
  * a copy of count() from fs/exec.c, except that it works with 32 bit argv
  * and envp pointers.
@@ -1699,6 +1619,7 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 	char *bits;
 	long timeout;
 	int size, max_fdset, ret = -EINVAL;
+	struct fdtable *fdt;
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -1724,7 +1645,10 @@ compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
-	max_fdset = current->files->max_fdset;
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
 	if (n > max_fdset)
 		n = max_fdset;
 
@@ -1808,8 +1732,8 @@ struct compat_nfsctl_export {
 	compat_dev_t	ex32_dev;
 	compat_ino_t	ex32_ino;
 	compat_int_t	ex32_flags;
-	compat_uid_t	ex32_anon_uid;
-	compat_gid_t	ex32_anon_gid;
+	__compat_uid_t	ex32_anon_uid;
+	__compat_gid_t	ex32_anon_gid;
 };
 
 struct compat_nfsctl_fdparm {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 155e612635f..e28a74203f3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -798,13 +798,16 @@ static int routing_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 		r = (void *) &r4;
 	}
 
-	if (ret)
-		return -EFAULT;
+	if (ret) {
+		ret = -EFAULT;
+		goto out;
+	}
 
 	set_fs (KERNEL_DS);
 	ret = sys_ioctl (fd, cmd, (unsigned long) r);
 	set_fs (old_fs);
 
+out:
 	if (mysock)
 		sockfd_put(mysock);
 
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6c285efa200..7fe85415ae7 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,12 +39,47 @@ static DECLARE_MUTEX(read_mutex);
 #define CRAMINO(x)	((x)->offset?(x)->offset<<2:1)
 #define OFFSET(x)	((x)->i_ino)
 
-static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inode * cramfs_inode)
+
+static int cramfs_iget5_test(struct inode *inode, void *opaque)
+{
+	struct cramfs_inode *cramfs_inode = opaque;
+
+	if (inode->i_ino != CRAMINO(cramfs_inode))
+		return 0; /* does not match */
+
+	if (inode->i_ino != 1)
+		return 1;
+
+	/* all empty directories, char, block, pipe, and sock, share inode #1 */
+
+	if ((inode->i_mode != cramfs_inode->mode) ||
+	    (inode->i_gid != cramfs_inode->gid) ||
+	    (inode->i_uid != cramfs_inode->uid))
+		return 0; /* does not match */
+
+	if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
+	    (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
+		return 0; /* does not match */
+
+	return 1; /* matches */
+}
+
+static int cramfs_iget5_set(struct inode *inode, void *opaque)
+{
+	struct cramfs_inode *cramfs_inode = opaque;
+	inode->i_ino = CRAMINO(cramfs_inode);
+	return 0;
+}
+
+static struct inode *get_cramfs_inode(struct super_block *sb,
+				struct cramfs_inode * cramfs_inode)
 {
-	struct inode * inode = new_inode(sb);
+	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
+					    cramfs_iget5_test, cramfs_iget5_set,
+					    cramfs_inode);
 	static struct timespec zerotime;
 
-	if (inode) {
+	if (inode && (inode->i_state & I_NEW)) {
 		inode->i_mode = cramfs_inode->mode;
 		inode->i_uid = cramfs_inode->uid;
 		inode->i_size = cramfs_inode->size;
@@ -58,7 +93,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inod
 		   but it's the best we can do without reading the directory
 	           contents.  1 yields the right result in GNU find, even
 		   without -noleaf option. */
-		insert_inode_hash(inode);
 		if (S_ISREG(inode->i_mode)) {
 			inode->i_fop = &generic_ro_fops;
 			inode->i_data.a_ops = &cramfs_aops;
@@ -74,6 +108,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inod
 			init_special_inode(inode, inode->i_mode,
 				old_decode_dev(cramfs_inode->size));
 		}
+		unlock_new_inode(inode);
 	}
 	return inode;
 }
diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile
index 5800df2e50c..236696efcba 100644
--- a/fs/devpts/Makefile
+++ b/fs/devpts/Makefile
@@ -5,4 +5,3 @@
 obj-$(CONFIG_UNIX98_PTYS)		+= devpts.o
 
 devpts-$(CONFIG_UNIX98_PTYS)		:= inode.o
-devpts-$(CONFIG_DEVPTS_FS_SECURITY)	+= xattr_security.o
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1571c8d6c23..f2be44d4491 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,28 +18,9 @@
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/devpts_fs.h>
-#include <linux/xattr.h>
 
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 
-extern struct xattr_handler devpts_xattr_security_handler;
-
-static struct xattr_handler *devpts_xattr_handlers[] = {
-#ifdef CONFIG_DEVPTS_FS_SECURITY
-	&devpts_xattr_security_handler,
-#endif
-	NULL
-};
-
-static struct inode_operations devpts_file_inode_operations = {
-#ifdef CONFIG_DEVPTS_FS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= generic_listxattr,
-	.removexattr	= generic_removexattr,
-#endif
-};
-
 static struct vfsmount *devpts_mnt;
 static struct dentry *devpts_root;
 
@@ -102,7 +83,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
 	s->s_op = &devpts_sops;
-	s->s_xattr = devpts_xattr_handlers;
 	s->s_time_gran = 1;
 
 	inode = new_inode(s);
@@ -175,7 +155,6 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_gid = config.setgid ? config.gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
-	inode->i_op = &devpts_file_inode_operations;
 	inode->u.generic_ip = tty;
 
 	dentry = get_node(number);
diff --git a/fs/devpts/xattr_security.c b/fs/devpts/xattr_security.c
deleted file mode 100644
index 864cb5c79ba..00000000000
--- a/fs/devpts/xattr_security.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Security xattr support for devpts.
- *
- * Author: Stephen Smalley <sds@epoch.ncsc.mil>
- * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/security.h>
-#include <linux/xattr.h>
-
-static size_t
-devpts_xattr_security_list(struct inode *inode, char *list, size_t list_len,
-			   const char *name, size_t name_len)
-{
-	return security_inode_listsecurity(inode, list, list_len);
-}
-
-static int
-devpts_xattr_security_get(struct inode *inode, const char *name,
-			  void *buffer, size_t size)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return security_inode_getsecurity(inode, name, buffer, size);
-}
-
-static int
-devpts_xattr_security_set(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return security_inode_setsecurity(inode, name, value, size, flags);
-}
-
-struct xattr_handler devpts_xattr_security_handler = {
-	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= devpts_xattr_security_list,
-	.get	= devpts_xattr_security_get,
-	.set	= devpts_xattr_security_set,
-};
diff --git a/fs/exec.c b/fs/exec.c
index 222ab1c572d..14dd03907cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -798,6 +798,7 @@ no_thread_group:
 static inline void flush_old_files(struct files_struct * files)
 {
 	long j = -1;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
 	for (;;) {
@@ -805,12 +806,13 @@ static inline void flush_old_files(struct files_struct * files)
 
 		j++;
 		i = j * __NFDBITS;
-		if (i >= files->max_fds || i >= files->max_fdset)
+		fdt = files_fdtable(files);
+		if (i >= fdt->max_fds || i >= fdt->max_fdset)
 			break;
-		set = files->close_on_exec->fds_bits[j];
+		set = fdt->close_on_exec->fds_bits[j];
 		if (!set)
 			continue;
-		files->close_on_exec->fds_bits[j] = 0;
+		fdt->close_on_exec->fds_bits[j] = 0;
 		spin_unlock(&files->file_lock);
 		for ( ; set ; i++,set >>= 1) {
 			if (set & 1) {
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 161f156d98c..c8d07030c89 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -615,6 +615,11 @@ got:
 		DQUOT_DROP(inode);
 		goto fail2;
 	}
+	err = ext2_init_security(inode,dir);
+	if (err) {
+		DQUOT_FREE_INODE(inode);
+		goto fail2;
+	}
 	mark_inode_dirty(inode);
 	ext2_debug("allocating inode %lu\n", inode->i_ino);
 	ext2_preread_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 53dceb0c659..fdba4d1d3c6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -71,6 +71,8 @@ void ext2_put_inode(struct inode *inode)
  */
 void ext2_delete_inode (struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index dcfe331dc4c..3c0c7c6a5b4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -19,6 +19,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
@@ -27,6 +28,8 @@
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -201,6 +204,26 @@ static void ext2_clear_inode(struct inode *inode)
 #endif
 }
 
+static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(vfs->mnt_sb);
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_GRPID)
+		seq_puts(seq, ",grpid");
+	else
+		seq_puts(seq, ",nogrpid");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
 #ifdef CONFIG_QUOTA
 static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
@@ -218,6 +241,7 @@ static struct super_operations ext2_sops = {
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
+	.show_options	= ext2_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
 	.quota_write	= ext2_quota_write,
@@ -256,10 +280,11 @@ static unsigned long get_sb_block(void **data)
 
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
-	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh,
-	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip,
-	Opt_ignore, Opt_err,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
+	Opt_err_ro, Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug,
+	Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
+	Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -288,10 +313,10 @@ static match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_xip, "xip"},
-	{Opt_ignore, "grpquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_ignore, "noquota"},
-	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
 	{Opt_err, NULL}
 };
 
@@ -406,6 +431,26 @@ static int parse_options (char * options,
 			printk("EXT2 xip option not supported\n");
 #endif
 			break;
+
+#if defined(CONFIG_QUOTA)
+		case Opt_quota:
+		case Opt_usrquota:
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+#else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
+			printk(KERN_ERR
+				"EXT2-fs: quota operations not supported.\n");
+
+			break;
+#endif
+
 		case Opt_ignore:
 			break;
 		default:
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5f3bfde3b81..67cfeb66e89 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,3 +116,11 @@ exit_ext2_xattr(void)
 
 # endif  /* CONFIG_EXT2_FS_XATTR */
 
+#ifdef CONFIG_EXT2_FS_SECURITY
+extern int ext2_init_security(struct inode *inode, struct inode *dir);
+#else
+static inline int ext2_init_security(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 6a6c59fbe59..a2661279847 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
 #include <linux/ext2_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 
 static size_t
@@ -45,6 +46,27 @@ ext2_xattr_security_set(struct inode *inode, const char *name,
 			      value, size, flags);
 }
 
+int
+ext2_init_security(struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY,
+			     name, value, len, 0);
+	kfree(name);
+	kfree(value);
+	return err;
+}
+
 struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 6981bd014ed..96552769d03 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -607,6 +607,11 @@ got:
 		DQUOT_DROP(inode);
 		goto fail2;
   	}
+	err = ext3_init_security(handle,inode, dir);
+	if (err) {
+		DQUOT_FREE_INODE(inode);
+		goto fail2;
+	}
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9989fdcf4d5..b5177c90d6f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -187,6 +187,8 @@ void ext3_delete_inode (struct inode * inode)
 {
 	handle_t *handle;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c3c6e399fb..a93c3609025 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/quotaops.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "xattr.h"
 #include "acl.h"
@@ -509,8 +510,41 @@ static void ext3_clear_inode(struct inode *inode)
 	kfree(rsv);
 }
 
-#ifdef CONFIG_QUOTA
+static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(vfs->mnt_sb);
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_JOURNAL_DATA)
+		seq_puts(seq, ",data=journal");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_ORDERED_DATA)
+		seq_puts(seq, ",data=ordered");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_WRITEBACK_DATA)
+		seq_puts(seq, ",data=writeback");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->s_jquota_fmt)
+		seq_printf(seq, ",jqfmt=%s",
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
+	if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
@@ -569,6 +603,7 @@ static struct super_operations ext3_sops = {
 	.statfs		= ext3_statfs,
 	.remount_fs	= ext3_remount,
 	.clear_inode	= ext3_clear_inode,
+	.show_options	= ext3_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext3_quota_read,
 	.quota_write	= ext3_quota_write,
@@ -590,7 +625,8 @@ enum {
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+	Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -634,10 +670,10 @@ static match_table_t tokens = {
 	{Opt_grpjquota, "grpjquota=%s"},
 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
-	{Opt_quota, "grpquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_noquota, "noquota"},
 	{Opt_quota, "quota"},
-	{Opt_quota, "usrquota"},
+	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
@@ -903,7 +939,13 @@ clear_qf_name:
 			sbi->s_jquota_fmt = QFMT_VFS_V0;
 			break;
 		case Opt_quota:
+		case Opt_usrquota:
 			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_enabled(sb)) {
@@ -912,8 +954,13 @@ clear_qf_name:
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 #else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
 		case Opt_usrjquota:
 		case Opt_grpjquota:
 		case Opt_offusrjquota:
@@ -924,7 +971,6 @@ clear_qf_name:
 				"EXT3-fs: journalled quota options not "
 				"supported.\n");
 			break;
-		case Opt_quota:
 		case Opt_noquota:
 			break;
 #endif
@@ -962,14 +1008,38 @@ clear_qf_name:
 		}
 	}
 #ifdef CONFIG_QUOTA
-	if (!sbi->s_jquota_fmt && (sbi->s_qf_names[USRQUOTA] ||
-	    sbi->s_qf_names[GRPQUOTA])) {
-		printk(KERN_ERR
-			"EXT3-fs: journalled quota format not specified.\n");
-		return 0;
+	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
+		     sbi->s_qf_names[USRQUOTA])
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+
+		if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
+		     sbi->s_qf_names[GRPQUOTA])
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+
+		if ((sbi->s_qf_names[USRQUOTA] &&
+				(sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
+		    (sbi->s_qf_names[GRPQUOTA] &&
+				(sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+			printk(KERN_ERR "EXT3-fs: old and new quota "
+					"format mixing.\n");
+			return 0;
+		}
+
+		if (!sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT3-fs: journalled quota format "
+					"not specified.\n");
+			return 0;
+		}
+	} else {
+		if (sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT3-fs: journalled quota format "
+					"specified with no journalling "
+					"enabled.\n");
+			return 0;
+		}
 	}
 #endif
-
 	return 1;
 }
 
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index eb31a69e82d..2ceae38f3d4 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -133,3 +133,14 @@ exit_ext3_xattr(void)
 #define ext3_xattr_handlers	NULL
 
 # endif  /* CONFIG_EXT3_FS_XATTR */
+
+#ifdef CONFIG_EXT3_FS_SECURITY
+extern int ext3_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir);
+#else
+static inline int ext3_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir)
+{
+	return 0;
+}
+#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index ddc1c41750e..b9c40c15647 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -9,6 +9,7 @@
 #include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
+#include <linux/security.h>
 #include "xattr.h"
 
 static size_t
@@ -47,6 +48,27 @@ ext3_xattr_security_set(struct inode *inode, const char *name,
 			      value, size, flags);
 }
 
+int
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
+				    name, value, len, 0);
+	kfree(name);
+	kfree(value);
+	return err;
+}
+
 struct xattr_handler ext3_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext3_xattr_security_list,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index e5ae1b720dd..895049b2ac9 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -30,6 +30,29 @@ static inline loff_t fat_make_i_pos(struct super_block *sb,
 		| (de - (struct msdos_dir_entry *)bh->b_data);
 }
 
+static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
+				     sector_t phys)
+{
+	struct super_block *sb = dir->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct buffer_head *bh;
+	int sec;
+
+	/* This is not a first sector of cluster, or sec_per_clus == 1 */
+	if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1)
+		return;
+	/* root dir of FAT12/FAT16 */
+	if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO))
+		return;
+
+	bh = sb_getblk(sb, phys);
+	if (bh && !buffer_uptodate(bh)) {
+		for (sec = 0; sec < sbi->sec_per_clus; sec++)
+			sb_breadahead(sb, phys + sec);
+	}
+	brelse(bh);
+}
+
 /* Returns the inode number of the directory entry at offset pos. If bh is
    non-NULL, it is brelse'd before. Pos is incremented. The buffer header is
    returned in bh.
@@ -58,6 +81,8 @@ next:
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
+	fat_dir_readahead(dir, iblock, phys);
+
 	*bh = sb_bread(sb, phys);
 	if (*bh == NULL) {
 		printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
@@ -635,8 +660,7 @@ RecEnd:
 EODir:
 	filp->f_pos = cpos;
 FillFailed:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 	if (unicode)
 		free_page((unsigned long)unicode);
 out:
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 96ae85b67eb..a7cbe68e225 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -335,6 +335,8 @@ EXPORT_SYMBOL(fat_build_inode);
 
 static void fat_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (!is_bad_inode(inode)) {
 		inode->i_size = 0;
 		fat_truncate(inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6fbc9d8fcc3..863b46e0d78 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
+#include <linux/rcupdate.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -24,21 +25,25 @@
 void fastcall set_close_on_exec(unsigned int fd, int flag)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
 	if (flag)
-		FD_SET(fd, files->close_on_exec);
+		FD_SET(fd, fdt->close_on_exec);
 	else
-		FD_CLR(fd, files->close_on_exec);
+		FD_CLR(fd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 }
 
 static inline int get_close_on_exec(unsigned int fd)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	int res;
-	spin_lock(&files->file_lock);
-	res = FD_ISSET(fd, files->close_on_exec);
-	spin_unlock(&files->file_lock);
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	res = FD_ISSET(fd, fdt->close_on_exec);
+	rcu_read_unlock();
 	return res;
 }
 
@@ -54,24 +59,26 @@ static int locate_fd(struct files_struct *files,
 	unsigned int newfd;
 	unsigned int start;
 	int error;
+	struct fdtable *fdt;
 
 	error = -EINVAL;
 	if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 		goto out;
 
 repeat:
+	fdt = files_fdtable(files);
 	/*
 	 * Someone might have closed fd's in the range
-	 * orig_start..files->next_fd
+	 * orig_start..fdt->next_fd
 	 */
 	start = orig_start;
-	if (start < files->next_fd)
-		start = files->next_fd;
+	if (start < fdt->next_fd)
+		start = fdt->next_fd;
 
 	newfd = start;
-	if (start < files->max_fdset) {
-		newfd = find_next_zero_bit(files->open_fds->fds_bits,
-			files->max_fdset, start);
+	if (start < fdt->max_fdset) {
+		newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
+			fdt->max_fdset, start);
 	}
 	
 	error = -EMFILE;
@@ -89,9 +96,15 @@ repeat:
 	if (error)
 		goto repeat;
 
-	if (start <= files->next_fd)
-		files->next_fd = newfd + 1;
-	
+	/*
+	 * We reacquired files_lock, so we are safe as long as
+	 * we reacquire the fdtable pointer and use it while holding
+	 * the lock, no one can free it during that time.
+	 */
+	fdt = files_fdtable(files);
+	if (start <= fdt->next_fd)
+		fdt->next_fd = newfd + 1;
+
 	error = newfd;
 	
 out:
@@ -101,13 +114,16 @@ out:
 static int dupfd(struct file *file, unsigned int start)
 {
 	struct files_struct * files = current->files;
+	struct fdtable *fdt;
 	int fd;
 
 	spin_lock(&files->file_lock);
 	fd = locate_fd(files, file, start);
 	if (fd >= 0) {
-		FD_SET(fd, files->open_fds);
-		FD_CLR(fd, files->close_on_exec);
+		/* locate_fd() may have expanded fdtable, load the ptr */
+		fdt = files_fdtable(files);
+		FD_SET(fd, fdt->open_fds);
+		FD_CLR(fd, fdt->close_on_exec);
 		spin_unlock(&files->file_lock);
 		fd_install(fd, file);
 	} else {
@@ -123,6 +139,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 	int err = -EBADF;
 	struct file * file, *tofree;
 	struct files_struct * files = current->files;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
 	if (!(file = fcheck(oldfd)))
@@ -148,13 +165,14 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 
 	/* Yes. It's a race. In user space. Nothing sane to do */
 	err = -EBUSY;
-	tofree = files->fd[newfd];
-	if (!tofree && FD_ISSET(newfd, files->open_fds))
+	fdt = files_fdtable(files);
+	tofree = fdt->fd[newfd];
+	if (!tofree && FD_ISSET(newfd, fdt->open_fds))
 		goto out_fput;
 
-	files->fd[newfd] = file;
-	FD_SET(newfd, files->open_fds);
-	FD_CLR(newfd, files->close_on_exec);
+	rcu_assign_pointer(fdt->fd[newfd], file);
+	FD_SET(newfd, fdt->open_fds);
+	FD_CLR(newfd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 
 	if (tofree)
diff --git a/fs/file.c b/fs/file.c
index 92b5f25985d..2127a7b9dc3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -13,6 +13,25 @@
 #include <linux/vmalloc.h>
 #include <linux/file.h>
 #include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+
+struct fdtable_defer {
+	spinlock_t lock;
+	struct work_struct wq;
+	struct timer_list timer;
+	struct fdtable *next;
+};
+
+/*
+ * We use this list to defer free fdtables that have vmalloced
+ * sets/arrays. By keeping a per-cpu list, we avoid having to embed
+ * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
+ * this per-task structure.
+ */
+static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 
 
 /*
@@ -48,82 +67,143 @@ void free_fd_array(struct file **array, int num)
 		vfree(array);
 }
 
-/*
- * Expand the fd array in the files_struct.  Called with the files
- * spinlock held for write.
- */
+static void __free_fdtable(struct fdtable *fdt)
+{
+	int fdset_size, fdarray_size;
 
-static int expand_fd_array(struct files_struct *files, int nr)
-	__releases(files->file_lock)
-	__acquires(files->file_lock)
+	fdset_size = fdt->max_fdset / 8;
+	fdarray_size = fdt->max_fds * sizeof(struct file *);
+	free_fdset(fdt->open_fds, fdset_size);
+	free_fdset(fdt->close_on_exec, fdset_size);
+	free_fd_array(fdt->fd, fdarray_size);
+	kfree(fdt);
+}
+
+static void fdtable_timer(unsigned long data)
 {
-	struct file **new_fds;
-	int error, nfds;
+	struct fdtable_defer *fddef = (struct fdtable_defer *)data;
 
-	
-	error = -EMFILE;
-	if (files->max_fds >= NR_OPEN || nr >= NR_OPEN)
+	spin_lock(&fddef->lock);
+	/*
+	 * If someone already emptied the queue return.
+	 */
+	if (!fddef->next)
 		goto out;
+	if (!schedule_work(&fddef->wq))
+		mod_timer(&fddef->timer, 5);
+out:
+	spin_unlock(&fddef->lock);
+}
 
-	nfds = files->max_fds;
-	spin_unlock(&files->file_lock);
+static void free_fdtable_work(struct fdtable_defer *f)
+{
+	struct fdtable *fdt;
 
-	/* 
-	 * Expand to the max in easy steps, and keep expanding it until
-	 * we have enough for the requested fd array size. 
-	 */
+	spin_lock_bh(&f->lock);
+	fdt = f->next;
+	f->next = NULL;
+	spin_unlock_bh(&f->lock);
+	while(fdt) {
+		struct fdtable *next = fdt->next;
+		__free_fdtable(fdt);
+		fdt = next;
+	}
+}
 
-	do {
-#if NR_OPEN_DEFAULT < 256
-		if (nfds < 256)
-			nfds = 256;
-		else 
-#endif
-		if (nfds < (PAGE_SIZE / sizeof(struct file *)))
-			nfds = PAGE_SIZE / sizeof(struct file *);
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
+static void free_fdtable_rcu(struct rcu_head *rcu)
+{
+	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
+	int fdset_size, fdarray_size;
+	struct fdtable_defer *fddef;
 
-	error = -ENOMEM;
-	new_fds = alloc_fd_array(nfds);
-	spin_lock(&files->file_lock);
-	if (!new_fds)
-		goto out;
+	BUG_ON(!fdt);
+	fdset_size = fdt->max_fdset / 8;
+	fdarray_size = fdt->max_fds * sizeof(struct file *);
 
-	/* Copy the existing array and install the new pointer */
-
-	if (nfds > files->max_fds) {
-		struct file **old_fds;
-		int i;
-		
-		old_fds = xchg(&files->fd, new_fds);
-		i = xchg(&files->max_fds, nfds);
-
-		/* Don't copy/clear the array if we are creating a new
-		   fd array for fork() */
-		if (i) {
-			memcpy(new_fds, old_fds, i * sizeof(struct file *));
-			/* clear the remainder of the array */
-			memset(&new_fds[i], 0,
-			       (nfds-i) * sizeof(struct file *)); 
-
-			spin_unlock(&files->file_lock);
-			free_fd_array(old_fds, i);
-			spin_lock(&files->file_lock);
-		}
+	if (fdt->free_files) {
+		/*
+		 * The this fdtable was embedded in the files structure
+		 * and the files structure itself was getting destroyed.
+		 * It is now safe to free the files structure.
+		 */
+		kmem_cache_free(files_cachep, fdt->free_files);
+		return;
+	}
+	if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+		/*
+		 * The fdtable was embedded
+		 */
+		return;
+	}
+	if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
+		kfree(fdt->open_fds);
+		kfree(fdt->close_on_exec);
+		kfree(fdt->fd);
+		kfree(fdt);
 	} else {
-		/* Somebody expanded the array while we slept ... */
-		spin_unlock(&files->file_lock);
-		free_fd_array(new_fds, nfds);
-		spin_lock(&files->file_lock);
+		fddef = &get_cpu_var(fdtable_defer_list);
+		spin_lock(&fddef->lock);
+		fdt->next = fddef->next;
+		fddef->next = fdt;
+		/*
+		 * vmallocs are handled from the workqueue context.
+		 * If the per-cpu workqueue is running, then we
+		 * defer work scheduling through a timer.
+		 */
+		if (!schedule_work(&fddef->wq))
+			mod_timer(&fddef->timer, 5);
+		spin_unlock(&fddef->lock);
+		put_cpu_var(fdtable_defer_list);
 	}
-	error = 0;
-out:
-	return error;
+}
+
+void free_fdtable(struct fdtable *fdt)
+{
+	if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
+					fdt->max_fds > NR_OPEN_DEFAULT)
+		call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+
+/*
+ * Expand the fdset in the files_struct.  Called with the files spinlock
+ * held for write.
+ */
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+{
+	int i;
+	int count;
+
+	BUG_ON(nfdt->max_fdset < fdt->max_fdset);
+	BUG_ON(nfdt->max_fds < fdt->max_fds);
+	/* Copy the existing tables and install the new pointers */
+
+	i = fdt->max_fdset / (sizeof(unsigned long) * 8);
+	count = (nfdt->max_fdset - fdt->max_fdset) / 8;
+
+	/*
+	 * Don't copy the entire array if the current fdset is
+	 * not yet initialised.
+	 */
+	if (i) {
+		memcpy (nfdt->open_fds, fdt->open_fds,
+						fdt->max_fdset/8);
+		memcpy (nfdt->close_on_exec, fdt->close_on_exec,
+						fdt->max_fdset/8);
+		memset (&nfdt->open_fds->fds_bits[i], 0, count);
+		memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
+	}
+
+	/* Don't copy/clear the array if we are creating a new
+	   fd array for fork() */
+	if (fdt->max_fds) {
+		memcpy(nfdt->fd, fdt->fd,
+			fdt->max_fds * sizeof(struct file *));
+		/* clear the remainder of the array */
+		memset(&nfdt->fd[fdt->max_fds], 0,
+		       (nfdt->max_fds - fdt->max_fds) *
+					sizeof(struct file *));
+	}
+	nfdt->next_fd = fdt->next_fd;
 }
 
 /*
@@ -154,26 +234,21 @@ void free_fdset(fd_set *array, int num)
 		vfree(array);
 }
 
-/*
- * Expand the fdset in the files_struct.  Called with the files spinlock
- * held for write.
- */
-static int expand_fdset(struct files_struct *files, int nr)
-	__releases(file->file_lock)
-	__acquires(file->file_lock)
+static struct fdtable *alloc_fdtable(int nr)
 {
-	fd_set *new_openset = NULL, *new_execset = NULL;
-	int error, nfds = 0;
-
-	error = -EMFILE;
-	if (files->max_fdset >= NR_OPEN || nr >= NR_OPEN)
-		goto out;
+	struct fdtable *fdt = NULL;
+	int nfds = 0;
+  	fd_set *new_openset = NULL, *new_execset = NULL;
+	struct file **new_fds;
 
-	nfds = files->max_fdset;
-	spin_unlock(&files->file_lock);
+	fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+	if (!fdt)
+  		goto out;
+	memset(fdt, 0, sizeof(*fdt));
 
-	/* Expand to the max in easy steps */
-	do {
+	nfds = __FD_SETSIZE;
+  	/* Expand to the max in easy steps */
+  	do {
 		if (nfds < (PAGE_SIZE * 8))
 			nfds = PAGE_SIZE * 8;
 		else {
@@ -183,49 +258,88 @@ static int expand_fdset(struct files_struct *files, int nr)
 		}
 	} while (nfds <= nr);
 
-	error = -ENOMEM;
-	new_openset = alloc_fdset(nfds);
-	new_execset = alloc_fdset(nfds);
-	spin_lock(&files->file_lock);
-	if (!new_openset || !new_execset)
+  	new_openset = alloc_fdset(nfds);
+  	new_execset = alloc_fdset(nfds);
+  	if (!new_openset || !new_execset)
+  		goto out;
+	fdt->open_fds = new_openset;
+	fdt->close_on_exec = new_execset;
+	fdt->max_fdset = nfds;
+
+	nfds = NR_OPEN_DEFAULT;
+	/*
+	 * Expand to the max in easy steps, and keep expanding it until
+	 * we have enough for the requested fd array size.
+	 */
+	do {
+#if NR_OPEN_DEFAULT < 256
+		if (nfds < 256)
+			nfds = 256;
+		else
+#endif
+		if (nfds < (PAGE_SIZE / sizeof(struct file *)))
+			nfds = PAGE_SIZE / sizeof(struct file *);
+		else {
+			nfds = nfds * 2;
+			if (nfds > NR_OPEN)
+				nfds = NR_OPEN;
+  		}
+	} while (nfds <= nr);
+	new_fds = alloc_fd_array(nfds);
+	if (!new_fds)
+		goto out;
+	fdt->fd = new_fds;
+	fdt->max_fds = nfds;
+	fdt->free_files = NULL;
+	return fdt;
+out:
+  	if (new_openset)
+  		free_fdset(new_openset, nfds);
+  	if (new_execset)
+  		free_fdset(new_execset, nfds);
+	kfree(fdt);
+	return NULL;
+}
+
+/*
+ * Expands the file descriptor table - it will allocate a new fdtable and
+ * both fd array and fdset. It is expected to be called with the
+ * files_lock held.
+ */
+static int expand_fdtable(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
+{
+	int error = 0;
+	struct fdtable *fdt;
+	struct fdtable *nfdt = NULL;
+
+	spin_unlock(&files->file_lock);
+	nfdt = alloc_fdtable(nr);
+	if (!nfdt) {
+		error = -ENOMEM;
+		spin_lock(&files->file_lock);
 		goto out;
+	}
 
-	error = 0;
-	
-	/* Copy the existing tables and install the new pointers */
-	if (nfds > files->max_fdset) {
-		int i = files->max_fdset / (sizeof(unsigned long) * 8);
-		int count = (nfds - files->max_fdset) / 8;
-		
-		/* 
-		 * Don't copy the entire array if the current fdset is
-		 * not yet initialised.  
-		 */
-		if (i) {
-			memcpy (new_openset, files->open_fds, files->max_fdset/8);
-			memcpy (new_execset, files->close_on_exec, files->max_fdset/8);
-			memset (&new_openset->fds_bits[i], 0, count);
-			memset (&new_execset->fds_bits[i], 0, count);
-		}
-		
-		nfds = xchg(&files->max_fdset, nfds);
-		new_openset = xchg(&files->open_fds, new_openset);
-		new_execset = xchg(&files->close_on_exec, new_execset);
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	/*
+	 * Check again since another task may have expanded the
+	 * fd table while we dropped the lock
+	 */
+	if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
+		copy_fdtable(nfdt, fdt);
+	} else {
+		/* Somebody expanded while we dropped file_lock */
 		spin_unlock(&files->file_lock);
-		free_fdset (new_openset, nfds);
-		free_fdset (new_execset, nfds);
+		__free_fdtable(nfdt);
 		spin_lock(&files->file_lock);
-		return 0;
-	} 
-	/* Somebody expanded the array while we slept ... */
-
+		goto out;
+	}
+	rcu_assign_pointer(files->fdt, nfdt);
+	free_fdtable(fdt);
 out:
-	spin_unlock(&files->file_lock);
-	if (new_openset)
-		free_fdset(new_openset, nfds);
-	if (new_execset)
-		free_fdset(new_execset, nfds);
-	spin_lock(&files->file_lock);
 	return error;
 }
 
@@ -237,18 +351,39 @@ out:
 int expand_files(struct files_struct *files, int nr)
 {
 	int err, expand = 0;
+	struct fdtable *fdt;
 
-	if (nr >= files->max_fdset) {
-		expand = 1;
-		if ((err = expand_fdset(files, nr)))
+	fdt = files_fdtable(files);
+	if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
+		if (fdt->max_fdset >= NR_OPEN ||
+			fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
+			err = -EMFILE;
 			goto out;
-	}
-	if (nr >= files->max_fds) {
+		}
 		expand = 1;
-		if ((err = expand_fd_array(files, nr)))
+		if ((err = expand_fdtable(files, nr)))
 			goto out;
 	}
 	err = expand;
 out:
 	return err;
 }
+
+static void __devinit fdtable_defer_list_init(int cpu)
+{
+	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
+	spin_lock_init(&fddef->lock);
+	INIT_WORK(&fddef->wq, (void (*)(void *))free_fdtable_work, fddef);
+	init_timer(&fddef->timer);
+	fddef->timer.data = (unsigned long)fddef;
+	fddef->timer.function = fdtable_timer;
+	fddef->next = NULL;
+}
+
+void __init files_defer_init(void)
+{
+	int i;
+	/* Really early - can't use for_each_cpu */
+	for (i = 0; i < NR_CPUS; i++)
+		fdtable_defer_list_init(i);
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index 1d3de78e6bc..86ec8ae985b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/security.h>
 #include <linux/eventpoll.h>
+#include <linux/rcupdate.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
@@ -53,11 +54,17 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
 	spin_unlock_irqrestore(&filp_count_lock, flags);
 }
 
-static inline void file_free(struct file *f)
+static inline void file_free_rcu(struct rcu_head *head)
 {
+	struct file *f =  container_of(head, struct file, f_rcuhead);
 	kmem_cache_free(filp_cachep, f);
 }
 
+static inline void file_free(struct file *f)
+{
+	call_rcu(&f->f_rcuhead, file_free_rcu);
+}
+
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
  * we run out of memory.
@@ -89,7 +96,6 @@ struct file *get_empty_filp(void)
 	rwlock_init(&f->f_owner.lock);
 	/* f->f_version: 0 */
 	INIT_LIST_HEAD(&f->f_list);
-	f->f_maxcount = INT_MAX;
 	return f;
 
 over:
@@ -111,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
 
 void fastcall fput(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count))
+	if (rcuref_dec_and_test(&file->f_count))
 		__fput(file);
 }
 
@@ -157,11 +163,17 @@ struct file fastcall *fget(unsigned int fd)
 	struct file *file;
 	struct files_struct *files = current->files;
 
-	spin_lock(&files->file_lock);
+	rcu_read_lock();
 	file = fcheck_files(files, fd);
-	if (file)
-		get_file(file);
-	spin_unlock(&files->file_lock);
+	if (file) {
+		if (!rcuref_inc_lf(&file->f_count)) {
+			/* File object ref couldn't be taken */
+			rcu_read_unlock();
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+
 	return file;
 }
 
@@ -183,21 +195,25 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 	if (likely((atomic_read(&files->count) == 1))) {
 		file = fcheck_files(files, fd);
 	} else {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
-			get_file(file);
-			*fput_needed = 1;
+			if (rcuref_inc_lf(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 	}
+
 	return file;
 }
 
 
 void put_filp(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count)) {
+	if (rcuref_dec_and_test(&file->f_count)) {
 		security_file_free(file);
 		file_kill(file);
 		file_free(file);
@@ -258,4 +274,5 @@ void __init files_init(unsigned long mempages)
 	files_stat.max_files = n; 
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
+	files_defer_init();
 } 
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 27f66d3e8a0..6aa6fbe4f8e 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -155,7 +155,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
 
 	sbp->s_flags |= MS_RDONLY;
 
-	infp = kcalloc(1, sizeof(*infp), GFP_KERNEL);
+	infp = kzalloc(sizeof(*infp), GFP_KERNEL);
 	if (!infp) {
 		printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
 		return -ENOMEM;
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
new file mode 100644
index 00000000000..c3e1f760cac
--- /dev/null
+++ b/fs/fuse/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the FUSE filesystem.
+#
+
+obj-$(CONFIG_FUSE_FS) += fuse.o
+
+fuse-objs := dev.o dir.o file.o inode.o
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
new file mode 100644
index 00000000000..d4c869c6d01
--- /dev/null
+++ b/fs/fuse/dev.c
@@ -0,0 +1,877 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+
+static kmem_cache_t *fuse_req_cachep;
+
+static inline struct fuse_conn *fuse_get_conn(struct file *file)
+{
+	struct fuse_conn *fc;
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	if (fc && !fc->mounted)
+		fc = NULL;
+	spin_unlock(&fuse_lock);
+	return fc;
+}
+
+static inline void fuse_request_init(struct fuse_req *req)
+{
+	memset(req, 0, sizeof(*req));
+	INIT_LIST_HEAD(&req->list);
+	init_waitqueue_head(&req->waitq);
+	atomic_set(&req->count, 1);
+}
+
+struct fuse_req *fuse_request_alloc(void)
+{
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
+	if (req)
+		fuse_request_init(req);
+	return req;
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+	kmem_cache_free(fuse_req_cachep, req);
+}
+
+static inline void block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static inline void restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
+void fuse_reset_request(struct fuse_req *req)
+{
+	int preallocated = req->preallocated;
+	BUG_ON(atomic_read(&req->count) != 1);
+	fuse_request_init(req);
+	req->preallocated = preallocated;
+}
+
+static void __fuse_get_request(struct fuse_req *req)
+{
+	atomic_inc(&req->count);
+}
+
+/* Must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+	BUG_ON(atomic_read(&req->count) < 2);
+	atomic_dec(&req->count);
+}
+
+static struct fuse_req *do_get_request(struct fuse_conn *fc)
+{
+	struct fuse_req *req;
+
+	spin_lock(&fuse_lock);
+	BUG_ON(list_empty(&fc->unused_list));
+	req = list_entry(fc->unused_list.next, struct fuse_req, list);
+	list_del_init(&req->list);
+	spin_unlock(&fuse_lock);
+	fuse_request_init(req);
+	req->preallocated = 1;
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+	return req;
+}
+
+/* This can return NULL, but only in case it's interrupted by a SIGKILL */
+struct fuse_req *fuse_get_request(struct fuse_conn *fc)
+{
+	int intr;
+	sigset_t oldset;
+
+	block_sigs(&oldset);
+	intr = down_interruptible(&fc->outstanding_sem);
+	restore_sigs(&oldset);
+	return intr ? NULL : do_get_request(fc);
+}
+
+static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	spin_lock(&fuse_lock);
+	if (req->preallocated)
+		list_add(&req->list, &fc->unused_list);
+	else
+		fuse_request_free(req);
+
+	/* If we are in debt decrease that first */
+	if (fc->outstanding_debt)
+		fc->outstanding_debt--;
+	else
+		up(&fc->outstanding_sem);
+	spin_unlock(&fuse_lock);
+}
+
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (atomic_dec_and_test(&req->count))
+		fuse_putback_request(fc, req);
+}
+
+void fuse_release_background(struct fuse_req *req)
+{
+	iput(req->inode);
+	iput(req->inode2);
+	if (req->file)
+		fput(req->file);
+	spin_lock(&fuse_lock);
+	list_del(&req->bg_entry);
+	spin_unlock(&fuse_lock);
+}
+
+/*
+ * This function is called when a request is finished.  Either a reply
+ * has arrived or it was interrupted (and not yet sent) or some error
+ * occured during communication with userspace, or the device file was
+ * closed.  It decreases the referece count for the request.  In case
+ * of a background request the referece to the stored objects are
+ * released.  The requester thread is woken up (if still waiting), and
+ * finally the request is either freed or put on the unused_list
+ *
+ * Called with fuse_lock, unlocks it
+ */
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int putback;
+	req->finished = 1;
+	putback = atomic_dec_and_test(&req->count);
+	spin_unlock(&fuse_lock);
+	if (req->background) {
+		down_read(&fc->sbput_sem);
+		if (fc->mounted)
+			fuse_release_background(req);
+		up_read(&fc->sbput_sem);
+	}
+	wake_up(&req->waitq);
+	if (req->in.h.opcode == FUSE_INIT) {
+		int i;
+
+		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
+			fc->conn_error = 1;
+
+		/* After INIT reply is received other requests can go
+		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+		   up()s on outstanding_sem.  The last up() is done in
+		   fuse_putback_request() */
+		for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+			up(&fc->outstanding_sem);
+	}
+	if (putback)
+		fuse_putback_request(fc, req);
+}
+
+/*
+ * Unfortunately request interruption not just solves the deadlock
+ * problem, it causes problems too.  These stem from the fact, that an
+ * interrupted request is continued to be processed in userspace,
+ * while all the locks and object references (inode and file) held
+ * during the operation are released.
+ *
+ * To release the locks is exactly why there's a need to interrupt the
+ * request, so there's not a lot that can be done about this, except
+ * introduce additional locking in userspace.
+ *
+ * More important is to keep inode and file references until userspace
+ * has replied, otherwise FORGET and RELEASE could be sent while the
+ * inode/file is still used by the filesystem.
+ *
+ * For this reason the concept of "background" request is introduced.
+ * An interrupted request is backgrounded if it has been already sent
+ * to userspace.  Backgrounding involves getting an extra reference to
+ * inode(s) or file used in the request, and adding the request to
+ * fc->background list.  When a reply is received for a background
+ * request, the object references are released, and the request is
+ * removed from the list.  If the filesystem is unmounted while there
+ * are still background requests, the list is walked and references
+ * are released as if a reply was received.
+ *
+ * There's one more use for a background request.  The RELEASE message is
+ * always sent as background, since it doesn't return an error or
+ * data.
+ */
+static void background_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->background = 1;
+	list_add(&req->bg_entry, &fc->background);
+	if (req->inode)
+		req->inode = igrab(req->inode);
+	if (req->inode2)
+		req->inode2 = igrab(req->inode2);
+	if (req->file)
+		get_file(req->file);
+}
+
+/* Called with fuse_lock held.  Releases, and then reacquires it. */
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+{
+	sigset_t oldset;
+
+	spin_unlock(&fuse_lock);
+	block_sigs(&oldset);
+	wait_event_interruptible(req->waitq, req->finished);
+	restore_sigs(&oldset);
+	spin_lock(&fuse_lock);
+	if (req->finished)
+		return;
+
+	req->out.h.error = -EINTR;
+	req->interrupted = 1;
+	if (req->locked) {
+		/* This is uninterruptible sleep, because data is
+		   being copied to/from the buffers of req.  During
+		   locked state, there mustn't be any filesystem
+		   operation (e.g. page fault), since that could lead
+		   to deadlock */
+		spin_unlock(&fuse_lock);
+		wait_event(req->waitq, !req->locked);
+		spin_lock(&fuse_lock);
+	}
+	if (!req->sent && !list_empty(&req->list)) {
+		list_del(&req->list);
+		__fuse_put_request(req);
+	} else if (!req->finished && req->sent)
+		background_request(fc, req);
+}
+
+static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+{
+	unsigned nbytes = 0;
+	unsigned i;
+
+	for (i = 0; i < numargs; i++)
+		nbytes += args[i].size;
+
+	return nbytes;
+}
+
+static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	fc->reqctr++;
+	/* zero is special */
+	if (fc->reqctr == 0)
+		fc->reqctr = 1;
+	req->in.h.unique = fc->reqctr;
+	req->in.h.len = sizeof(struct fuse_in_header) +
+		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+	if (!req->preallocated) {
+		/* If request is not preallocated (either FORGET or
+		   RELEASE), then still decrease outstanding_sem, so
+		   user can't open infinite number of files while not
+		   processing the RELEASE requests.  However for
+		   efficiency do it without blocking, so if down()
+		   would block, just increase the debt instead */
+		if (down_trylock(&fc->outstanding_sem))
+			fc->outstanding_debt++;
+	}
+	list_add_tail(&req->list, &fc->pending);
+	wake_up(&fc->waitq);
+}
+
+/*
+ * This can only be interrupted by a SIGKILL
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	spin_lock(&fuse_lock);
+	if (!fc->connected)
+		req->out.h.error = -ENOTCONN;
+	else if (fc->conn_error)
+		req->out.h.error = -ECONNREFUSED;
+	else {
+		queue_request(fc, req);
+		/* acquire extra reference, since request is still needed
+		   after request_end() */
+		__fuse_get_request(req);
+
+		request_wait_answer(fc, req);
+	}
+	spin_unlock(&fuse_lock);
+}
+
+static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+{
+	spin_lock(&fuse_lock);
+	if (fc->connected) {
+		queue_request(fc, req);
+		spin_unlock(&fuse_lock);
+	} else {
+		req->out.h.error = -ENOTCONN;
+		request_end(fc, req);
+	}
+}
+
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 0;
+	request_send_nowait(fc, req);
+}
+
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	spin_lock(&fuse_lock);
+	background_request(fc, req);
+	spin_unlock(&fuse_lock);
+	request_send_nowait(fc, req);
+}
+
+void fuse_send_init(struct fuse_conn *fc)
+{
+	/* This is called from fuse_read_super() so there's guaranteed
+	   to be a request available */
+	struct fuse_req *req = do_get_request(fc);
+	struct fuse_init_in_out *arg = &req->misc.init_in_out;
+	arg->major = FUSE_KERNEL_VERSION;
+	arg->minor = FUSE_KERNEL_MINOR_VERSION;
+	req->in.h.opcode = FUSE_INIT;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(*arg);
+	req->out.args[0].value = arg;
+	request_send_background(fc, req);
+}
+
+/*
+ * Lock the request.  Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault.  If the request was already
+ * interrupted bail out.
+ */
+static inline int lock_request(struct fuse_req *req)
+{
+	int err = 0;
+	if (req) {
+		spin_lock(&fuse_lock);
+		if (req->interrupted)
+			err = -ENOENT;
+		else
+			req->locked = 1;
+		spin_unlock(&fuse_lock);
+	}
+	return err;
+}
+
+/*
+ * Unlock request.  If it was interrupted during being locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
+ */
+static inline void unlock_request(struct fuse_req *req)
+{
+	if (req) {
+		spin_lock(&fuse_lock);
+		req->locked = 0;
+		if (req->interrupted)
+			wake_up(&req->waitq);
+		spin_unlock(&fuse_lock);
+	}
+}
+
+struct fuse_copy_state {
+	int write;
+	struct fuse_req *req;
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	unsigned long seglen;
+	unsigned long addr;
+	struct page *pg;
+	void *mapaddr;
+	void *buf;
+	unsigned len;
+};
+
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+			   struct fuse_req *req, const struct iovec *iov,
+			   unsigned long nr_segs)
+{
+	memset(cs, 0, sizeof(*cs));
+	cs->write = write;
+	cs->req = req;
+	cs->iov = iov;
+	cs->nr_segs = nr_segs;
+}
+
+/* Unmap and put previous page of userspace buffer */
+static inline void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+	if (cs->mapaddr) {
+		kunmap_atomic(cs->mapaddr, KM_USER0);
+		if (cs->write) {
+			flush_dcache_page(cs->pg);
+			set_page_dirty_lock(cs->pg);
+		}
+		put_page(cs->pg);
+		cs->mapaddr = NULL;
+	}
+}
+
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+	unsigned long offset;
+	int err;
+
+	unlock_request(cs->req);
+	fuse_copy_finish(cs);
+	if (!cs->seglen) {
+		BUG_ON(!cs->nr_segs);
+		cs->seglen = cs->iov[0].iov_len;
+		cs->addr = (unsigned long) cs->iov[0].iov_base;
+		cs->iov ++;
+		cs->nr_segs --;
+	}
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
+			     &cs->pg, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err < 0)
+		return err;
+	BUG_ON(err != 1);
+	offset = cs->addr % PAGE_SIZE;
+	cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+	cs->buf = cs->mapaddr + offset;
+	cs->len = min(PAGE_SIZE - offset, cs->seglen);
+	cs->seglen -= cs->len;
+	cs->addr += cs->len;
+
+	return lock_request(cs->req);
+}
+
+/* Do as much copy to/from userspace buffer as we can */
+static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
+			       unsigned *size)
+{
+	unsigned ncpy = min(*size, cs->len);
+	if (val) {
+		if (cs->write)
+			memcpy(cs->buf, *val, ncpy);
+		else
+			memcpy(*val, cs->buf, ncpy);
+		*val += ncpy;
+	}
+	*size -= ncpy;
+	cs->len -= ncpy;
+	cs->buf += ncpy;
+	return ncpy;
+}
+
+/*
+ * Copy a page in the request to/from the userspace buffer.  Must be
+ * done atomically
+ */
+static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+				 unsigned offset, unsigned count, int zeroing)
+{
+	if (page && zeroing && count < PAGE_SIZE) {
+		void *mapaddr = kmap_atomic(page, KM_USER1);
+		memset(mapaddr, 0, PAGE_SIZE);
+		kunmap_atomic(mapaddr, KM_USER1);
+	}
+	while (count) {
+		int err;
+		if (!cs->len && (err = fuse_copy_fill(cs)))
+			return err;
+		if (page) {
+			void *mapaddr = kmap_atomic(page, KM_USER1);
+			void *buf = mapaddr + offset;
+			offset += fuse_copy_do(cs, &buf, &count);
+			kunmap_atomic(mapaddr, KM_USER1);
+		} else
+			offset += fuse_copy_do(cs, NULL, &count);
+	}
+	if (page && !cs->write)
+		flush_dcache_page(page);
+	return 0;
+}
+
+/* Copy pages in the request to/from userspace buffer */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
+	unsigned i;
+	struct fuse_req *req = cs->req;
+	unsigned offset = req->page_offset;
+	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
+
+	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+		struct page *page = req->pages[i];
+		int err = fuse_copy_page(cs, page, offset, count, zeroing);
+		if (err)
+			return err;
+
+		nbytes -= count;
+		count = min(nbytes, (unsigned) PAGE_SIZE);
+		offset = 0;
+	}
+	return 0;
+}
+
+/* Copy a single argument in the request to/from userspace buffer */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+	while (size) {
+		int err;
+		if (!cs->len && (err = fuse_copy_fill(cs)))
+			return err;
+		fuse_copy_do(cs, &val, &size);
+	}
+	return 0;
+}
+
+/* Copy request arguments to/from userspace buffer */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+			  unsigned argpages, struct fuse_arg *args,
+			  int zeroing)
+{
+	int err = 0;
+	unsigned i;
+
+	for (i = 0; !err && i < numargs; i++)  {
+		struct fuse_arg *arg = &args[i];
+		if (i == numargs - 1 && argpages)
+			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else
+			err = fuse_copy_one(cs, arg->value, arg->size);
+	}
+	return err;
+}
+
+/* Wait until a request is available on the pending list */
+static void request_wait(struct fuse_conn *fc)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(&fc->waitq, &wait);
+	while (fc->mounted && list_empty(&fc->pending)) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (signal_pending(current))
+			break;
+
+		spin_unlock(&fuse_lock);
+		schedule();
+		spin_lock(&fuse_lock);
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&fc->waitq, &wait);
+}
+
+/*
+ * Read a single request into the userspace filesystem's buffer.  This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer.  If
+ * no reply is needed (FORGET) or request has been interrupted or
+ * there was an error during the copying then it's finished by calling
+ * request_end().  Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
+static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t *off)
+{
+	int err;
+	struct fuse_conn *fc;
+	struct fuse_req *req;
+	struct fuse_in *in;
+	struct fuse_copy_state cs;
+	unsigned reqsize;
+
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	err = -EPERM;
+	if (!fc)
+		goto err_unlock;
+	request_wait(fc);
+	err = -ENODEV;
+	if (!fc->mounted)
+		goto err_unlock;
+	err = -ERESTARTSYS;
+	if (list_empty(&fc->pending))
+		goto err_unlock;
+
+	req = list_entry(fc->pending.next, struct fuse_req, list);
+	list_del_init(&req->list);
+	spin_unlock(&fuse_lock);
+
+	in = &req->in;
+	reqsize = req->in.h.len;
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	err = -EINVAL;
+	if (iov_length(iov, nr_segs) >= reqsize) {
+		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+		if (!err)
+			err = fuse_copy_args(&cs, in->numargs, in->argpages,
+					     (struct fuse_arg *) in->args, 0);
+	}
+	fuse_copy_finish(&cs);
+
+	spin_lock(&fuse_lock);
+	req->locked = 0;
+	if (!err && req->interrupted)
+		err = -ENOENT;
+	if (err) {
+		if (!req->interrupted)
+			req->out.h.error = -EIO;
+		request_end(fc, req);
+		return err;
+	}
+	if (!req->isreply)
+		request_end(fc, req);
+	else {
+		req->sent = 1;
+		list_add_tail(&req->list, &fc->processing);
+		spin_unlock(&fuse_lock);
+	}
+	return reqsize;
+
+ err_unlock:
+	spin_unlock(&fuse_lock);
+	return err;
+}
+
+static ssize_t fuse_dev_read(struct file *file, char __user *buf,
+			     size_t nbytes, loff_t *off)
+{
+	struct iovec iov;
+	iov.iov_len = nbytes;
+	iov.iov_base = buf;
+	return fuse_dev_readv(file, &iov, 1, off);
+}
+
+/* Look up request on processing list by unique ID */
+static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+{
+	struct list_head *entry;
+
+	list_for_each(entry, &fc->processing) {
+		struct fuse_req *req;
+		req = list_entry(entry, struct fuse_req, list);
+		if (req->in.h.unique == unique)
+			return req;
+	}
+	return NULL;
+}
+
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+			 unsigned nbytes)
+{
+	unsigned reqsize = sizeof(struct fuse_out_header);
+
+	if (out->h.error)
+		return nbytes != reqsize ? -EINVAL : 0;
+
+	reqsize += len_args(out->numargs, out->args);
+
+	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+		return -EINVAL;
+	else if (reqsize > nbytes) {
+		struct fuse_arg *lastarg = &out->args[out->numargs-1];
+		unsigned diffsize = reqsize - nbytes;
+		if (diffsize > lastarg->size)
+			return -EINVAL;
+		lastarg->size -= diffsize;
+	}
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
+			      out->page_zeroing);
+}
+
+/*
+ * Write a single reply to a request.  First the header is copied from
+ * the write buffer.  The request is then searched on the processing
+ * list by the unique ID found in the header.  If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ */
+static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t *off)
+{
+	int err;
+	unsigned nbytes = iov_length(iov, nr_segs);
+	struct fuse_req *req;
+	struct fuse_out_header oh;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -ENODEV;
+
+	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+	if (nbytes < sizeof(struct fuse_out_header))
+		return -EINVAL;
+
+	err = fuse_copy_one(&cs, &oh, sizeof(oh));
+	if (err)
+		goto err_finish;
+	err = -EINVAL;
+	if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
+	    oh.len != nbytes)
+		goto err_finish;
+
+	spin_lock(&fuse_lock);
+	req = request_find(fc, oh.unique);
+	err = -EINVAL;
+	if (!req)
+		goto err_unlock;
+
+	list_del_init(&req->list);
+	if (req->interrupted) {
+		request_end(fc, req);
+		fuse_copy_finish(&cs);
+		return -ENOENT;
+	}
+	req->out.h = oh;
+	req->locked = 1;
+	cs.req = req;
+	spin_unlock(&fuse_lock);
+
+	err = copy_out_args(&cs, &req->out, nbytes);
+	fuse_copy_finish(&cs);
+
+	spin_lock(&fuse_lock);
+	req->locked = 0;
+	if (!err) {
+		if (req->interrupted)
+			err = -ENOENT;
+	} else if (!req->interrupted)
+		req->out.h.error = -EIO;
+	request_end(fc, req);
+
+	return err ? err : nbytes;
+
+ err_unlock:
+	spin_unlock(&fuse_lock);
+ err_finish:
+	fuse_copy_finish(&cs);
+	return err;
+}
+
+static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
+			      size_t nbytes, loff_t *off)
+{
+	struct iovec iov;
+	iov.iov_len = nbytes;
+	iov.iov_base = (char __user *) buf;
+	return fuse_dev_writev(file, &iov, 1, off);
+}
+
+static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	unsigned mask = POLLOUT | POLLWRNORM;
+
+	if (!fc)
+		return -ENODEV;
+
+	poll_wait(file, &fc->waitq, wait);
+
+	spin_lock(&fuse_lock);
+	if (!list_empty(&fc->pending))
+                mask |= POLLIN | POLLRDNORM;
+	spin_unlock(&fuse_lock);
+
+	return mask;
+}
+
+/* Abort all requests on the given list (pending or processing) */
+static void end_requests(struct fuse_conn *fc, struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct fuse_req *req;
+		req = list_entry(head->next, struct fuse_req, list);
+		list_del_init(&req->list);
+		req->out.h.error = -ECONNABORTED;
+		request_end(fc, req);
+		spin_lock(&fuse_lock);
+	}
+}
+
+static int fuse_dev_release(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc;
+
+	spin_lock(&fuse_lock);
+	fc = file->private_data;
+	if (fc) {
+		fc->connected = 0;
+		end_requests(fc, &fc->pending);
+		end_requests(fc, &fc->processing);
+		fuse_release_conn(fc);
+	}
+	spin_unlock(&fuse_lock);
+	return 0;
+}
+
+struct file_operations fuse_dev_operations = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= fuse_dev_read,
+	.readv		= fuse_dev_readv,
+	.write		= fuse_dev_write,
+	.writev		= fuse_dev_writev,
+	.poll		= fuse_dev_poll,
+	.release	= fuse_dev_release,
+};
+
+static struct miscdevice fuse_miscdevice = {
+	.minor = FUSE_MINOR,
+	.name  = "fuse",
+	.fops = &fuse_dev_operations,
+};
+
+int __init fuse_dev_init(void)
+{
+	int err = -ENOMEM;
+	fuse_req_cachep = kmem_cache_create("fuse_request",
+					    sizeof(struct fuse_req),
+					    0, 0, NULL, NULL);
+	if (!fuse_req_cachep)
+		goto out;
+
+	err = misc_register(&fuse_miscdevice);
+	if (err)
+		goto out_cache_clean;
+
+	return 0;
+
+ out_cache_clean:
+	kmem_cache_destroy(fuse_req_cachep);
+ out:
+	return err;
+}
+
+void fuse_dev_cleanup(void)
+{
+	misc_deregister(&fuse_miscdevice);
+	kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
new file mode 100644
index 00000000000..e79e49b3eec
--- /dev/null
+++ b/fs/fuse/dir.c
@@ -0,0 +1,982 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+static inline unsigned long time_to_jiffies(unsigned long sec,
+					    unsigned long nsec)
+{
+	struct timespec ts = {sec, nsec};
+	return jiffies + timespec_to_jiffies(&ts);
+}
+
+static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
+			     struct dentry *entry,
+			     struct fuse_entry_out *outarg)
+{
+	req->in.h.opcode = FUSE_LOOKUP;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_entry_out);
+	req->out.args[0].value = outarg;
+}
+
+static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
+{
+	if (!entry->d_inode || is_bad_inode(entry->d_inode))
+		return 0;
+	else if (time_after(jiffies, entry->d_time)) {
+		int err;
+		struct fuse_entry_out outarg;
+		struct inode *inode = entry->d_inode;
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_req *req = fuse_get_request(fc);
+		if (!req)
+			return 0;
+
+		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
+		request_send(fc, req);
+		err = req->out.h.error;
+		if (!err) {
+			if (outarg.nodeid != get_node_id(inode)) {
+				fuse_send_forget(fc, req, outarg.nodeid, 1);
+				return 0;
+			}
+			fi->nlookup ++;
+		}
+		fuse_put_request(fc, req);
+		if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+			return 0;
+
+		fuse_change_attributes(inode, &outarg.attr);
+		entry->d_time = time_to_jiffies(outarg.entry_valid,
+						outarg.entry_valid_nsec);
+		fi->i_time = time_to_jiffies(outarg.attr_valid,
+					     outarg.attr_valid_nsec);
+	}
+	return 1;
+}
+
+static struct dentry_operations fuse_dentry_operations = {
+	.d_revalidate	= fuse_dentry_revalidate,
+};
+
+static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
+			    struct inode **inodep)
+{
+	int err;
+	struct fuse_entry_out outarg;
+	struct inode *inode = NULL;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req;
+
+	if (entry->d_name.len > FUSE_NAME_MAX)
+		return -ENAMETOOLONG;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	fuse_lookup_init(req, dir, entry, &outarg);
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (!err) {
+		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+				  &outarg.attr);
+		if (!inode) {
+			fuse_send_forget(fc, req, outarg.nodeid, 1);
+			return -ENOMEM;
+		}
+	}
+	fuse_put_request(fc, req);
+	if (err && err != -ENOENT)
+		return err;
+
+	if (inode) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		entry->d_time =	time_to_jiffies(outarg.entry_valid,
+						outarg.entry_valid_nsec);
+		fi->i_time = time_to_jiffies(outarg.attr_valid,
+					     outarg.attr_valid_nsec);
+	}
+
+	entry->d_op = &fuse_dentry_operations;
+	*inodep = inode;
+	return 0;
+}
+
+void fuse_invalidate_attr(struct inode *inode)
+{
+	get_fuse_inode(inode)->i_time = jiffies - 1;
+}
+
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+	d_invalidate(entry);
+	entry->d_time = jiffies - 1;
+}
+
+static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
+			    struct inode *dir, struct dentry *entry,
+			    int mode)
+{
+	struct fuse_entry_out outarg;
+	struct inode *inode;
+	struct fuse_inode *fi;
+	int err;
+
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (err) {
+		fuse_put_request(fc, req);
+		return err;
+	}
+	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+			  &outarg.attr);
+	if (!inode) {
+		fuse_send_forget(fc, req, outarg.nodeid, 1);
+		return -ENOMEM;
+	}
+	fuse_put_request(fc, req);
+
+	/* Don't allow userspace to do really stupid things... */
+	if ((inode->i_mode ^ mode) & S_IFMT) {
+		iput(inode);
+		return -EIO;
+	}
+
+	entry->d_time = time_to_jiffies(outarg.entry_valid,
+					outarg.entry_valid_nsec);
+
+	fi = get_fuse_inode(inode);
+	fi->i_time = time_to_jiffies(outarg.attr_valid,
+				     outarg.attr_valid_nsec);
+
+	d_instantiate(entry, inode);
+	fuse_invalidate_attr(dir);
+	return 0;
+}
+
+static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
+		      dev_t rdev)
+{
+	struct fuse_mknod_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	inarg.rdev = new_encode_dev(rdev);
+	req->in.h.opcode = FUSE_MKNOD;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = entry->d_name.len + 1;
+	req->in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, req, dir, entry, mode);
+}
+
+static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
+		       struct nameidata *nd)
+{
+	return fuse_mknod(dir, entry, mode, 0);
+}
+
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
+{
+	struct fuse_mkdir_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.mode = mode;
+	req->in.h.opcode = FUSE_MKDIR;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = entry->d_name.len + 1;
+	req->in.args[1].value = entry->d_name.name;
+	return create_new_entry(fc, req, dir, entry, S_IFDIR);
+}
+
+static int fuse_symlink(struct inode *dir, struct dentry *entry,
+			const char *link)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	unsigned len = strlen(link) + 1;
+	struct fuse_req *req;
+
+	if (len > FUSE_SYMLINK_MAX)
+		return -ENAMETOOLONG;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.h.opcode = FUSE_SYMLINK;
+	req->in.numargs = 2;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	req->in.args[1].size = len;
+	req->in.args[1].value = link;
+	return create_new_entry(fc, req, dir, entry, S_IFLNK);
+}
+
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.h.opcode = FUSE_UNLINK;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		struct inode *inode = entry->d_inode;
+
+		/* Set nlink to zero so the inode can be cleared, if
+                   the inode does have more links this will be
+                   discovered at the next lookup/getattr */
+		inode->i_nlink = 0;
+		fuse_invalidate_attr(inode);
+		fuse_invalidate_attr(dir);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+	int err;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.h.opcode = FUSE_RMDIR;
+	req->in.h.nodeid = get_node_id(dir);
+	req->inode = dir;
+	req->in.numargs = 1;
+	req->in.args[0].size = entry->d_name.len + 1;
+	req->in.args[0].value = entry->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		entry->d_inode->i_nlink = 0;
+		fuse_invalidate_attr(dir);
+	} else if (err == -EINTR)
+		fuse_invalidate_entry(entry);
+	return err;
+}
+
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+		       struct inode *newdir, struct dentry *newent)
+{
+	int err;
+	struct fuse_rename_in inarg;
+	struct fuse_conn *fc = get_fuse_conn(olddir);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.newdir = get_node_id(newdir);
+	req->in.h.opcode = FUSE_RENAME;
+	req->in.h.nodeid = get_node_id(olddir);
+	req->inode = olddir;
+	req->inode2 = newdir;
+	req->in.numargs = 3;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = oldent->d_name.len + 1;
+	req->in.args[1].value = oldent->d_name.name;
+	req->in.args[2].size = newent->d_name.len + 1;
+	req->in.args[2].value = newent->d_name.name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		fuse_invalidate_attr(olddir);
+		if (olddir != newdir)
+			fuse_invalidate_attr(newdir);
+	} else if (err == -EINTR) {
+		/* If request was interrupted, DEITY only knows if the
+		   rename actually took place.  If the invalidation
+		   fails (e.g. some process has CWD under the renamed
+		   directory), then there can be inconsistency between
+		   the dcache and the real filesystem.  Tough luck. */
+		fuse_invalidate_entry(oldent);
+		if (newent->d_inode)
+			fuse_invalidate_entry(newent);
+	}
+
+	return err;
+}
+
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+		     struct dentry *newent)
+{
+	int err;
+	struct fuse_link_in inarg;
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.oldnodeid = get_node_id(inode);
+	req->in.h.opcode = FUSE_LINK;
+	req->inode2 = inode;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = newent->d_name.len + 1;
+	req->in.args[1].value = newent->d_name.name;
+	err = create_new_entry(fc, req, newdir, newent, inode->i_mode);
+	/* Contrary to "normal" filesystems it can happen that link
+	   makes two "logical" inodes point to the same "physical"
+	   inode.  We invalidate the attributes of the old one, so it
+	   will reflect changes in the backing inode (link count,
+	   etc.)
+	*/
+	if (!err || err == -EINTR)
+		fuse_invalidate_attr(inode);
+	return err;
+}
+
+int fuse_do_getattr(struct inode *inode)
+{
+	int err;
+	struct fuse_attr_out arg;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.h.opcode = FUSE_GETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(arg);
+	req->out.args[0].value = &arg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		if ((inode->i_mode ^ arg.attr.mode) & S_IFMT) {
+			make_bad_inode(inode);
+			err = -EIO;
+		} else {
+			struct fuse_inode *fi = get_fuse_inode(inode);
+			fuse_change_attributes(inode, &arg.attr);
+			fi->i_time = time_to_jiffies(arg.attr_valid,
+						     arg.attr_valid_nsec);
+		}
+	}
+	return err;
+}
+
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the requester process.  This
+ * means, that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways.  For example
+ * it can delay the operation for arbitrary length of time allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege.  This
+ * excludes processes started by other users, suid or sgid processes.
+ */
+static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+{
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		return 1;
+
+	if (task->euid == fc->user_id &&
+	    task->suid == fc->user_id &&
+	    task->uid == fc->user_id &&
+	    task->egid == fc->group_id &&
+	    task->sgid == fc->group_id &&
+	    task->gid == fc->group_id)
+		return 1;
+
+	return 0;
+}
+
+static int fuse_revalidate(struct dentry *entry)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (!fuse_allow_task(fc, current))
+		return -EACCES;
+	if (get_node_id(inode) != FUSE_ROOT_ID &&
+	    time_before_eq(jiffies, fi->i_time))
+		return 0;
+
+	return fuse_do_getattr(inode);
+}
+
+static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (!fuse_allow_task(fc, current))
+		return -EACCES;
+	else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+		int err = generic_permission(inode, mask, NULL);
+
+		/* If permission is denied, try to refresh file
+		   attributes.  This is also needed, because the root
+		   node will at first have no permissions */
+		if (err == -EACCES) {
+		 	err = fuse_do_getattr(inode);
+			if (!err)
+				err = generic_permission(inode, mask, NULL);
+		}
+
+		/* FIXME: Need some mechanism to revoke permissions:
+		   currently if the filesystem suddenly changes the
+		   file mode, we will not be informed about it, and
+		   continue to allow access to the file/directory.
+
+		   This is actually not so grave, since the user can
+		   simply keep access to the file/directory anyway by
+		   keeping it open... */
+
+		return err;
+	} else {
+		int mode = inode->i_mode;
+		if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+                    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+                        return -EROFS;
+		if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO))
+			return -EACCES;
+		return 0;
+	}
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+			 void *dstbuf, filldir_t filldir)
+{
+	while (nbytes >= FUSE_NAME_OFFSET) {
+		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+		size_t reclen = FUSE_DIRENT_SIZE(dirent);
+		int over;
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+
+		over = filldir(dstbuf, dirent->name, dirent->namelen,
+			       file->f_pos, dirent->ino, dirent->type);
+		if (over)
+			break;
+
+		buf += reclen;
+		nbytes -= reclen;
+		file->f_pos = dirent->off;
+	}
+
+	return 0;
+}
+
+static inline size_t fuse_send_readdir(struct fuse_req *req, struct file *file,
+				       struct inode *inode, loff_t pos,
+				       size_t count)
+{
+	return fuse_send_read_common(req, file, inode, pos, count, 1);
+}
+
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+	int err;
+	size_t nbytes;
+	struct page *page;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		fuse_put_request(fc, req);
+		return -ENOMEM;
+	}
+	req->num_pages = 1;
+	req->pages[0] = page;
+	nbytes = fuse_send_readdir(req, file, inode, file->f_pos, PAGE_SIZE);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
+				    filldir);
+
+	__free_page(page);
+	fuse_invalidate_attr(inode); /* atime changed */
+	return err;
+}
+
+static char *read_link(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = fuse_get_request(fc);
+	char *link;
+
+	if (!req)
+		return ERR_PTR(-EINTR);
+
+	link = (char *) __get_free_page(GFP_KERNEL);
+	if (!link) {
+		link = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	req->in.h.opcode = FUSE_READLINK;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->out.argvar = 1;
+	req->out.numargs = 1;
+	req->out.args[0].size = PAGE_SIZE - 1;
+	req->out.args[0].value = link;
+	request_send(fc, req);
+	if (req->out.h.error) {
+		free_page((unsigned long) link);
+		link = ERR_PTR(req->out.h.error);
+	} else
+		link[req->out.args[0].size] = '\0';
+ out:
+	fuse_put_request(fc, req);
+	fuse_invalidate_attr(inode); /* atime changed */
+	return link;
+}
+
+static void free_link(char *link)
+{
+	if (!IS_ERR(link))
+		free_page((unsigned long) link);
+}
+
+static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, read_link(dentry));
+	return NULL;
+}
+
+static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+	free_link(nd_get_link(nd));
+}
+
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+	return fuse_open_common(inode, file, 1);
+}
+
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+	return fuse_release_common(inode, file, 1);
+}
+
+static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	/* nfsd can call this with no file */
+	return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+}
+
+static unsigned iattr_to_fattr(struct iattr *iattr, struct fuse_attr *fattr)
+{
+	unsigned ivalid = iattr->ia_valid;
+	unsigned fvalid = 0;
+
+	memset(fattr, 0, sizeof(*fattr));
+
+	if (ivalid & ATTR_MODE)
+		fvalid |= FATTR_MODE,   fattr->mode = iattr->ia_mode;
+	if (ivalid & ATTR_UID)
+		fvalid |= FATTR_UID,    fattr->uid = iattr->ia_uid;
+	if (ivalid & ATTR_GID)
+		fvalid |= FATTR_GID,    fattr->gid = iattr->ia_gid;
+	if (ivalid & ATTR_SIZE)
+		fvalid |= FATTR_SIZE,   fattr->size = iattr->ia_size;
+	/* You can only _set_ these together (they may change by themselves) */
+	if ((ivalid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME)) {
+		fvalid |= FATTR_ATIME | FATTR_MTIME;
+		fattr->atime = iattr->ia_atime.tv_sec;
+		fattr->mtime = iattr->ia_mtime.tv_sec;
+	}
+
+	return fvalid;
+}
+
+static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	int err;
+	int is_truncate = 0;
+
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+		err = inode_change_ok(inode, attr);
+		if (err)
+			return err;
+	}
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		unsigned long limit;
+		is_truncate = 1;
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+	}
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.valid = iattr_to_fattr(attr, &inarg.attr);
+	req->in.h.opcode = FUSE_SETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err) {
+		if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+			make_bad_inode(inode);
+			err = -EIO;
+		} else {
+			if (is_truncate) {
+				loff_t origsize = i_size_read(inode);
+				i_size_write(inode, outarg.attr.size);
+				if (origsize > outarg.attr.size)
+					vmtruncate(inode, outarg.attr.size);
+			}
+			fuse_change_attributes(inode, &outarg.attr);
+			fi->i_time = time_to_jiffies(outarg.attr_valid,
+						     outarg.attr_valid_nsec);
+		}
+	} else if (err == -EINTR)
+		fuse_invalidate_attr(inode);
+
+	return err;
+}
+
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+			struct kstat *stat)
+{
+	struct inode *inode = entry->d_inode;
+	int err = fuse_revalidate(entry);
+	if (!err)
+		generic_fillattr(inode, stat);
+
+	return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  struct nameidata *nd)
+{
+	struct inode *inode;
+	int err = fuse_lookup_iget(dir, entry, &inode);
+	if (err)
+		return ERR_PTR(err);
+	if (inode && S_ISDIR(inode->i_mode)) {
+		/* Don't allow creating an alias to a directory  */
+		struct dentry *alias = d_find_alias(inode);
+		if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+			dput(alias);
+			iput(inode);
+			return ERR_PTR(-EIO);
+		}
+	}
+	return d_splice_alias(inode, entry);
+}
+
+static int fuse_setxattr(struct dentry *entry, const char *name,
+			 const void *value, size_t size, int flags)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_setxattr_in inarg;
+	int err;
+
+	if (size > FUSE_XATTR_SIZE_MAX)
+		return -E2BIG;
+
+	if (fc->no_setxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	inarg.flags = flags;
+	req->in.h.opcode = FUSE_SETXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 3;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = strlen(name) + 1;
+	req->in.args[1].value = name;
+	req->in.args[2].size = size;
+	req->in.args[2].value = value;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_setxattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
+static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
+			     void *value, size_t size)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_getxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	req->in.h.opcode = FUSE_GETXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = strlen(name) + 1;
+	req->in.args[1].value = name;
+	/* This is really two different operations rolled into one */
+	req->out.numargs = 1;
+	if (size) {
+		req->out.argvar = 1;
+		req->out.args[0].size = size;
+		req->out.args[0].value = value;
+	} else {
+		req->out.args[0].size = sizeof(outarg);
+		req->out.args[0].value = &outarg;
+	}
+	request_send(fc, req);
+	ret = req->out.h.error;
+	if (!ret)
+		ret = size ? req->out.args[0].size : outarg.size;
+	else {
+		if (ret == -ENOSYS) {
+			fc->no_getxattr = 1;
+			ret = -EOPNOTSUPP;
+		}
+	}
+	fuse_put_request(fc, req);
+	return ret;
+}
+
+static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_getxattr_in inarg;
+	struct fuse_getxattr_out outarg;
+	ssize_t ret;
+
+	if (fc->no_listxattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.size = size;
+	req->in.h.opcode = FUSE_LISTXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	/* This is really two different operations rolled into one */
+	req->out.numargs = 1;
+	if (size) {
+		req->out.argvar = 1;
+		req->out.args[0].size = size;
+		req->out.args[0].value = list;
+	} else {
+		req->out.args[0].size = sizeof(outarg);
+		req->out.args[0].value = &outarg;
+	}
+	request_send(fc, req);
+	ret = req->out.h.error;
+	if (!ret)
+		ret = size ? req->out.args[0].size : outarg.size;
+	else {
+		if (ret == -ENOSYS) {
+			fc->no_listxattr = 1;
+			ret = -EOPNOTSUPP;
+		}
+	}
+	fuse_put_request(fc, req);
+	return ret;
+}
+
+static int fuse_removexattr(struct dentry *entry, const char *name)
+{
+	struct inode *inode = entry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	int err;
+
+	if (fc->no_removexattr)
+		return -EOPNOTSUPP;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.h.opcode = FUSE_REMOVEXATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = strlen(name) + 1;
+	req->in.args[0].value = name;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_removexattr = 1;
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
+static struct inode_operations fuse_dir_inode_operations = {
+	.lookup		= fuse_lookup,
+	.mkdir		= fuse_mkdir,
+	.symlink	= fuse_symlink,
+	.unlink		= fuse_unlink,
+	.rmdir		= fuse_rmdir,
+	.rename		= fuse_rename,
+	.link		= fuse_link,
+	.setattr	= fuse_setattr,
+	.create		= fuse_create,
+	.mknod		= fuse_mknod,
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+static struct file_operations fuse_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= fuse_readdir,
+	.open		= fuse_dir_open,
+	.release	= fuse_dir_release,
+	.fsync		= fuse_dir_fsync,
+};
+
+static struct inode_operations fuse_common_inode_operations = {
+	.setattr	= fuse_setattr,
+	.permission	= fuse_permission,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+static struct inode_operations fuse_symlink_inode_operations = {
+	.setattr	= fuse_setattr,
+	.follow_link	= fuse_follow_link,
+	.put_link	= fuse_put_link,
+	.readlink	= generic_readlink,
+	.getattr	= fuse_getattr,
+	.setxattr	= fuse_setxattr,
+	.getxattr	= fuse_getxattr,
+	.listxattr	= fuse_listxattr,
+	.removexattr	= fuse_removexattr,
+};
+
+void fuse_init_common(struct inode *inode)
+{
+	inode->i_op = &fuse_common_inode_operations;
+}
+
+void fuse_init_dir(struct inode *inode)
+{
+	inode->i_op = &fuse_dir_inode_operations;
+	inode->i_fop = &fuse_dir_operations;
+}
+
+void fuse_init_symlink(struct inode *inode)
+{
+	inode->i_op = &fuse_symlink_inode_operations;
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
new file mode 100644
index 00000000000..6454022b053
--- /dev/null
+++ b/fs/fuse/file.c
@@ -0,0 +1,555 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+static struct file_operations fuse_direct_io_file_operations;
+
+int fuse_open_common(struct inode *inode, struct file *file, int isdir)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_open_in inarg;
+	struct fuse_open_out outarg;
+	struct fuse_file *ff;
+	int err;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	/* If opening the root node, no lookup has been performed on
+	   it, so the attributes must be refreshed */
+	if (get_node_id(inode) == FUSE_ROOT_ID) {
+		int err = fuse_do_getattr(inode);
+		if (err)
+		 	return err;
+	}
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	err = -ENOMEM;
+	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
+	if (!ff)
+		goto out_put_request;
+
+	ff->release_req = fuse_request_alloc();
+	if (!ff->release_req) {
+		kfree(ff);
+		goto out_put_request;
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+	req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (err) {
+		fuse_request_free(ff->release_req);
+		kfree(ff);
+	} else {
+		if (!isdir && (outarg.open_flags & FOPEN_DIRECT_IO))
+			file->f_op = &fuse_direct_io_file_operations;
+		if (!(outarg.open_flags & FOPEN_KEEP_CACHE))
+			invalidate_inode_pages(inode->i_mapping);
+		ff->fh = outarg.fh;
+		file->private_data = ff;
+	}
+
+ out_put_request:
+	fuse_put_request(fc, req);
+	return err;
+}
+
+int fuse_release_common(struct inode *inode, struct file *file, int isdir)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req = ff->release_req;
+	struct fuse_release_in *inarg = &req->misc.release_in;
+
+	inarg->fh = ff->fh;
+	inarg->flags = file->f_flags & ~O_EXCL;
+	req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_release_in);
+	req->in.args[0].value = inarg;
+	request_send_background(fc, req);
+	kfree(ff);
+
+	/* Return value is ignored by VFS */
+	return 0;
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
+{
+	return fuse_open_common(inode, file, 0);
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+	return fuse_release_common(inode, file, 0);
+}
+
+static int fuse_flush(struct file *file)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_flush_in inarg;
+	int err;
+
+	if (fc->no_flush)
+		return 0;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	req->in.h.opcode = FUSE_FLUSH;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		fc->no_flush = 1;
+		err = 0;
+	}
+	return err;
+}
+
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+		      int isdir)
+{
+	struct inode *inode = de->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_req *req;
+	struct fuse_fsync_in inarg;
+	int err;
+
+	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+		return 0;
+
+	req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.fh = ff->fh;
+	inarg.fsync_flags = datasync ? 1 : 0;
+	req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (err == -ENOSYS) {
+		if (isdir)
+			fc->no_fsyncdir = 1;
+		else
+			fc->no_fsync = 1;
+		err = 0;
+	}
+	return err;
+}
+
+static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+{
+	return fuse_fsync_common(file, de, datasync, 0);
+}
+
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+			     struct inode *inode, loff_t pos, size_t count,
+			     int isdir)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_read_in inarg;
+
+	memset(&inarg, 0, sizeof(struct fuse_read_in));
+	inarg.fh = ff->fh;
+	inarg.offset = pos;
+	inarg.size = count;
+	req->in.h.opcode = isdir ? FUSE_READDIR : FUSE_READ;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_read_in);
+	req->in.args[0].value = &inarg;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+	req->out.numargs = 1;
+	req->out.args[0].size = count;
+	request_send(fc, req);
+	return req->out.args[0].size;
+}
+
+static inline size_t fuse_send_read(struct fuse_req *req, struct file *file,
+				    struct inode *inode, loff_t pos,
+				    size_t count)
+{
+	return fuse_send_read_common(req, file, inode, pos, count, 0);
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
+	struct fuse_req *req = fuse_get_request(fc);
+	int err = -EINTR;
+	if (!req)
+		goto out;
+
+	req->out.page_zeroing = 1;
+	req->num_pages = 1;
+	req->pages[0] = page;
+	fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		SetPageUptodate(page);
+	fuse_invalidate_attr(inode); /* atime changed */
+ out:
+	unlock_page(page);
+	return err;
+}
+
+static int fuse_send_readpages(struct fuse_req *req, struct file *file,
+			       struct inode *inode)
+{
+	loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+	unsigned i;
+	req->out.page_zeroing = 1;
+	fuse_send_read(req, file, inode, pos, count);
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (!req->out.h.error)
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return req->out.h.error;
+}
+
+struct fuse_readpages_data {
+	struct fuse_req *req;
+	struct file *file;
+	struct inode *inode;
+};
+
+static int fuse_readpages_fill(void *_data, struct page *page)
+{
+	struct fuse_readpages_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int err = fuse_send_readpages(req, data->file, inode);
+		if (err) {
+			unlock_page(page);
+			return err;
+		}
+		fuse_reset_request(req);
+	}
+	req->pages[req->num_pages] = page;
+	req->num_pages ++;
+	return 0;
+}
+
+static int fuse_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_readpages_data data;
+	int err;
+	data.file = file;
+	data.inode = inode;
+	data.req = fuse_get_request(fc);
+	if (!data.req)
+		return -EINTR;
+
+	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
+	if (!err && data.req->num_pages)
+		err = fuse_send_readpages(data.req, file, inode);
+	fuse_put_request(fc, data.req);
+	fuse_invalidate_attr(inode); /* atime changed */
+	return err;
+}
+
+static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+			      struct inode *inode, loff_t pos, size_t count)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	struct fuse_write_in inarg;
+	struct fuse_write_out outarg;
+
+	memset(&inarg, 0, sizeof(struct fuse_write_in));
+	inarg.fh = ff->fh;
+	inarg.offset = pos;
+	inarg.size = count;
+	req->in.h.opcode = FUSE_WRITE;
+	req->in.h.nodeid = get_node_id(inode);
+	req->inode = inode;
+	req->file = file;
+	req->in.argpages = 1;
+	req->in.numargs = 2;
+	req->in.args[0].size = sizeof(struct fuse_write_in);
+	req->in.args[0].value = &inarg;
+	req->in.args[1].size = count;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(struct fuse_write_out);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	return outarg.size;
+}
+
+static int fuse_prepare_write(struct file *file, struct page *page,
+			      unsigned offset, unsigned to)
+{
+	/* No op */
+	return 0;
+}
+
+static int fuse_commit_write(struct file *file, struct page *page,
+			     unsigned offset, unsigned to)
+{
+	int err;
+	size_t nres;
+	unsigned count = to - offset;
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->num_pages = 1;
+	req->pages[0] = page;
+	req->page_offset = offset;
+	nres = fuse_send_write(req, file, inode, pos, count);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err && nres != count)
+		err = -EIO;
+	if (!err) {
+		pos += count;
+		if (pos > i_size_read(inode))
+			i_size_write(inode, pos);
+
+		if (offset == 0 && to == PAGE_CACHE_SIZE) {
+			clear_page_dirty(page);
+			SetPageUptodate(page);
+		}
+	}
+	fuse_invalidate_attr(inode);
+	return err;
+}
+
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+	unsigned i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (write)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+			       unsigned nbytes, int write)
+{
+	unsigned long user_addr = (unsigned long) buf;
+	unsigned offset = user_addr & ~PAGE_MASK;
+	int npages;
+
+	/* This doesn't work with nfsd */
+	if (!current->mm)
+		return -EPERM;
+
+	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	npages = min(npages, FUSE_MAX_PAGES_PER_REQ);
+	down_read(&current->mm->mmap_sem);
+	npages = get_user_pages(current, current->mm, user_addr, npages, write,
+				0, req->pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (npages < 0)
+		return npages;
+
+	req->num_pages = npages;
+	req->page_offset = offset;
+	return 0;
+}
+
+static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos, int write)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	size_t nmax = write ? fc->max_write : fc->max_read;
+	loff_t pos = *ppos;
+	ssize_t res = 0;
+	struct fuse_req *req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	while (count) {
+		size_t tmp;
+		size_t nres;
+		size_t nbytes = min(count, nmax);
+		int err = fuse_get_user_pages(req, buf, nbytes, !write);
+		if (err) {
+			res = err;
+			break;
+		}
+		tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+		nbytes = min(nbytes, tmp);
+		if (write)
+			nres = fuse_send_write(req, file, inode, pos, nbytes);
+		else
+			nres = fuse_send_read(req, file, inode, pos, nbytes);
+		fuse_release_user_pages(req, !write);
+		if (req->out.h.error) {
+			if (!res)
+				res = req->out.h.error;
+			break;
+		} else if (nres > nbytes) {
+			res = -EIO;
+			break;
+		}
+		count -= nres;
+		res += nres;
+		pos += nres;
+		buf += nres;
+		if (nres != nbytes)
+			break;
+		if (count)
+			fuse_reset_request(req);
+	}
+	fuse_put_request(fc, req);
+	if (res > 0) {
+		if (write && pos > i_size_read(inode))
+			i_size_write(inode, pos);
+		*ppos = pos;
+	}
+	fuse_invalidate_attr(inode);
+
+	return res;
+}
+
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	return fuse_direct_io(file, buf, count, ppos, 0);
+}
+
+static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	ssize_t res;
+	/* Don't allow parallel writes to the same file */
+	down(&inode->i_sem);
+	res = fuse_direct_io(file, buf, count, ppos, 1);
+	up(&inode->i_sem);
+	return res;
+}
+
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & VM_SHARED)) {
+		if ((vma->vm_flags & VM_WRITE))
+			return -ENODEV;
+		else
+			vma->vm_flags &= ~VM_MAYWRITE;
+	}
+	return generic_file_mmap(file, vma);
+}
+
+static int fuse_set_page_dirty(struct page *page)
+{
+	printk("fuse_set_page_dirty: should not happen\n");
+	dump_stack();
+	return 0;
+}
+
+static struct file_operations fuse_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= fuse_file_mmap,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	.sendfile	= generic_file_sendfile,
+};
+
+static struct file_operations fuse_direct_io_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= fuse_direct_read,
+	.write		= fuse_direct_write,
+	.open		= fuse_open,
+	.flush		= fuse_flush,
+	.release	= fuse_release,
+	.fsync		= fuse_fsync,
+	/* no mmap and sendfile */
+};
+
+static struct address_space_operations fuse_file_aops  = {
+	.readpage	= fuse_readpage,
+	.prepare_write	= fuse_prepare_write,
+	.commit_write	= fuse_commit_write,
+	.readpages	= fuse_readpages,
+	.set_page_dirty	= fuse_set_page_dirty,
+};
+
+void fuse_init_file_inode(struct inode *inode)
+{
+	inode->i_fop = &fuse_file_operations;
+	inode->i_data.a_ops = &fuse_file_aops;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
new file mode 100644
index 00000000000..24d761518d8
--- /dev/null
+++ b/fs/fuse/fuse_i.h
@@ -0,0 +1,451 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <asm/semaphore.h>
+
+/** Max number of pages that can be used in a single read request */
+#define FUSE_MAX_PAGES_PER_REQ 32
+
+/** If more requests are outstanding, then the operation will block */
+#define FUSE_MAX_OUTSTANDING 10
+
+/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
+    module will check permissions based on the file mode.  Otherwise no
+    permission checking is done in the kernel */
+#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
+
+/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
+    doing the mount will be allowed to access the filesystem */
+#define FUSE_ALLOW_OTHER         (1 << 1)
+
+
+/** FUSE inode */
+struct fuse_inode {
+	/** Inode data */
+	struct inode inode;
+
+	/** Unique ID, which identifies the inode between userspace
+	 * and kernel */
+	u64 nodeid;
+
+	/** Number of lookups on this inode */
+	u64 nlookup;
+
+	/** The request used for sending the FORGET message */
+	struct fuse_req *forget_req;
+
+	/** Time in jiffies until the file attributes are valid */
+	unsigned long i_time;
+};
+
+/** FUSE specific file data */
+struct fuse_file {
+	/** Request reserved for flush and release */
+	struct fuse_req *release_req;
+
+	/** File handle used by userspace */
+	u64 fh;
+};
+
+/** One input argument of a request */
+struct fuse_in_arg {
+	unsigned size;
+	const void *value;
+};
+
+/** The request input */
+struct fuse_in {
+	/** The request header */
+	struct fuse_in_header h;
+
+	/** True if the data for the last argument is in req->pages */
+	unsigned argpages:1;
+
+	/** Number of arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_in_arg args[3];
+};
+
+/** One output argument of a request */
+struct fuse_arg {
+	unsigned size;
+	void *value;
+};
+
+/** The request output */
+struct fuse_out {
+	/** Header returned from userspace */
+	struct fuse_out_header h;
+
+	/** Last argument is variable length (can be shorter than
+	    arg->size) */
+	unsigned argvar:1;
+
+	/** Last argument is a list of pages to copy data to */
+	unsigned argpages:1;
+
+	/** Zero partially or not copied pages */
+	unsigned page_zeroing:1;
+
+	/** Number or arguments */
+	unsigned numargs;
+
+	/** Array of arguments */
+	struct fuse_arg args[3];
+};
+
+struct fuse_req;
+struct fuse_conn;
+
+/**
+ * A request to the client
+ */
+struct fuse_req {
+	/** This can be on either unused_list, pending or processing
+	    lists in fuse_conn */
+	struct list_head list;
+
+	/** Entry on the background list */
+	struct list_head bg_entry;
+
+	/** refcount */
+	atomic_t count;
+
+	/** True if the request has reply */
+	unsigned isreply:1;
+
+	/** The request is preallocated */
+	unsigned preallocated:1;
+
+	/** The request was interrupted */
+	unsigned interrupted:1;
+
+	/** Request is sent in the background */
+	unsigned background:1;
+
+	/** Data is being copied to/from the request */
+	unsigned locked:1;
+
+	/** Request has been sent to userspace */
+	unsigned sent:1;
+
+	/** The request is finished */
+	unsigned finished:1;
+
+	/** The request input */
+	struct fuse_in in;
+
+	/** The request output */
+	struct fuse_out out;
+
+	/** Used to wake up the task waiting for completion of request*/
+	wait_queue_head_t waitq;
+
+	/** Data for asynchronous requests */
+	union {
+		struct fuse_forget_in forget_in;
+		struct fuse_release_in release_in;
+		struct fuse_init_in_out init_in_out;
+	} misc;
+
+	/** page vector */
+	struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+
+	/** number of pages in vector */
+	unsigned num_pages;
+
+	/** offset of data on first page */
+	unsigned page_offset;
+
+	/** Inode used in the request */
+	struct inode *inode;
+
+	/** Second inode used in the request (or NULL) */
+	struct inode *inode2;
+
+	/** File used in the request (or NULL) */
+	struct file *file;
+};
+
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the filesystem is mounted, and is
+ * destroyed, when the client device is closed and the filesystem is
+ * unmounted.
+ */
+struct fuse_conn {
+	/** Reference count */
+	int count;
+
+	/** The user id for this mount */
+	uid_t user_id;
+
+	/** The group id for this mount */
+	gid_t group_id;
+
+	/** The fuse mount flags for this mount */
+	unsigned flags;
+
+	/** Maximum read size */
+	unsigned max_read;
+
+	/** Maximum write size */
+	unsigned max_write;
+
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** Requests put in the background (RELEASE or any other
+	    interrupted request) */
+	struct list_head background;
+
+	/** Controls the maximum number of outstanding requests */
+	struct semaphore outstanding_sem;
+
+	/** This counts the number of outstanding requests if
+	    outstanding_sem would go negative */
+	unsigned outstanding_debt;
+
+	/** RW semaphore for exclusion with fuse_put_super() */
+	struct rw_semaphore sbput_sem;
+
+	/** The list of unused requests */
+	struct list_head unused_list;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** Mount is active */
+	unsigned mounted : 1;
+
+	/** Connection established */
+	unsigned connected : 1;
+
+	/** Connection failed (version mismatch) */
+	unsigned conn_error : 1;
+
+	/** Is fsync not implemented by fs? */
+	unsigned no_fsync : 1;
+
+	/** Is fsyncdir not implemented by fs? */
+	unsigned no_fsyncdir : 1;
+
+	/** Is flush not implemented by fs? */
+	unsigned no_flush : 1;
+
+	/** Is setxattr not implemented by fs? */
+	unsigned no_setxattr : 1;
+
+	/** Is getxattr not implemented by fs? */
+	unsigned no_getxattr : 1;
+
+	/** Is listxattr not implemented by fs? */
+	unsigned no_listxattr : 1;
+
+	/** Is removexattr not implemented by fs? */
+	unsigned no_removexattr : 1;
+
+	/** Backing dev info */
+	struct backing_dev_info bdi;
+};
+
+static inline struct fuse_conn **get_fuse_conn_super_p(struct super_block *sb)
+{
+	return (struct fuse_conn **) &sb->s_fs_info;
+}
+
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+	return *get_fuse_conn_super_p(sb);
+}
+
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+	return get_fuse_conn_super(inode->i_sb);
+}
+
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+	return container_of(inode, struct fuse_inode, inode);
+}
+
+static inline u64 get_node_id(struct inode *inode)
+{
+	return get_fuse_inode(inode)->nodeid;
+}
+
+/** Device operations */
+extern struct file_operations fuse_dev_operations;
+
+/**
+ * This is the single global spinlock which protects FUSE's structures
+ *
+ * The following data is protected by this lock:
+ *
+ *  - the private_data field of the device file
+ *  - the s_fs_info field of the super block
+ *  - unused_list, pending, processing lists in fuse_conn
+ *  - background list in fuse_conn
+ *  - the unique request ID counter reqctr in fuse_conn
+ *  - the sb (super_block) field in fuse_conn
+ *  - the file (device file) field in fuse_conn
+ */
+extern spinlock_t fuse_lock;
+
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+			int generation, struct fuse_attr *attr);
+
+/**
+ * Send FORGET command
+ */
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+		      unsigned long nodeid, u64 nlookup);
+
+/**
+ * Send READ or READDIR request
+ */
+size_t fuse_send_read_common(struct fuse_req *req, struct file *file,
+			     struct inode *inode, loff_t pos, size_t count,
+			     int isdir);
+
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, int isdir);
+
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+int fuse_release_common(struct inode *inode, struct file *file, int isdir);
+
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+		      int isdir);
+
+/**
+ * Initialise file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode);
+
+/**
+ * Initialise inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+
+/**
+ * Initialise inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+
+/**
+ * Initialise inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr);
+
+/**
+ * Check if the connection can be released, and if yes, then free the
+ * connection structure
+ */
+void fuse_release_conn(struct fuse_conn *fc);
+
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+
+/**
+ * Allocate a request
+ */
+struct fuse_req *fuse_request_alloc(void);
+
+/**
+ * Free a request
+ */
+void fuse_request_free(struct fuse_req *req);
+
+/**
+ * Reinitialize a request, the preallocated flag is left unmodified
+ */
+void fuse_reset_request(struct fuse_req *req);
+
+/**
+ * Reserve a preallocated request
+ */
+struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+
+/**
+ * Decrement reference count of a request.  If count goes to zero put
+ * on unused list (preallocated) or free reqest (not preallocated).
+ */
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request (synchronous)
+ */
+void request_send(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request with no reply
+ */
+void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request in the background
+ */
+void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Release inodes and file assiciated with background request
+ */
+void fuse_release_background(struct fuse_req *req);
+
+/**
+ * Get the attributes of a file
+ */
+int fuse_do_getattr(struct inode *inode);
+
+/**
+ * Invalidate inode attributes
+ */
+void fuse_invalidate_attr(struct inode *inode);
+
+/**
+ * Send the INIT message
+ */
+void fuse_send_init(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
new file mode 100644
index 00000000000..e69a546844d
--- /dev/null
+++ b/fs/fuse/inode.c
@@ -0,0 +1,591 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+
+spinlock_t fuse_lock;
+static kmem_cache_t *fuse_inode_cachep;
+
+#define FUSE_SUPER_MAGIC 0x65735546
+
+struct fuse_mount_data {
+	int fd;
+	unsigned rootmode;
+	unsigned user_id;
+	unsigned group_id;
+	unsigned fd_present : 1;
+	unsigned rootmode_present : 1;
+	unsigned user_id_present : 1;
+	unsigned group_id_present : 1;
+	unsigned flags;
+	unsigned max_read;
+};
+
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+	struct inode *inode;
+	struct fuse_inode *fi;
+
+	inode = kmem_cache_alloc(fuse_inode_cachep, SLAB_KERNEL);
+	if (!inode)
+		return NULL;
+
+	fi = get_fuse_inode(inode);
+	fi->i_time = jiffies - 1;
+	fi->nodeid = 0;
+	fi->nlookup = 0;
+	fi->forget_req = fuse_request_alloc();
+	if (!fi->forget_req) {
+		kmem_cache_free(fuse_inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
+}
+
+static void fuse_destroy_inode(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	if (fi->forget_req)
+		fuse_request_free(fi->forget_req);
+	kmem_cache_free(fuse_inode_cachep, inode);
+}
+
+static void fuse_read_inode(struct inode *inode)
+{
+	/* No op */
+}
+
+void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
+		      unsigned long nodeid, u64 nlookup)
+{
+	struct fuse_forget_in *inarg = &req->misc.forget_in;
+	inarg->nlookup = nlookup;
+	req->in.h.opcode = FUSE_FORGET;
+	req->in.h.nodeid = nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(struct fuse_forget_in);
+	req->in.args[0].value = inarg;
+	request_send_noreply(fc, req);
+}
+
+static void fuse_clear_inode(struct inode *inode)
+{
+	if (inode->i_sb->s_flags & MS_ACTIVE) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup);
+		fi->forget_req = NULL;
+	}
+}
+
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
+{
+	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
+		invalidate_inode_pages(inode->i_mapping);
+
+	inode->i_ino     = attr->ino;
+	inode->i_mode    = (inode->i_mode & S_IFMT) + (attr->mode & 07777);
+	inode->i_nlink   = attr->nlink;
+	inode->i_uid     = attr->uid;
+	inode->i_gid     = attr->gid;
+	i_size_write(inode, attr->size);
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks  = attr->blocks;
+	inode->i_atime.tv_sec   = attr->atime;
+	inode->i_atime.tv_nsec  = attr->atimensec;
+	inode->i_mtime.tv_sec   = attr->mtime;
+	inode->i_mtime.tv_nsec  = attr->mtimensec;
+	inode->i_ctime.tv_sec   = attr->ctime;
+	inode->i_ctime.tv_nsec  = attr->ctimensec;
+}
+
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+	inode->i_mode = attr->mode & S_IFMT;
+	i_size_write(inode, attr->size);
+	if (S_ISREG(inode->i_mode)) {
+		fuse_init_common(inode);
+		fuse_init_file_inode(inode);
+	} else if (S_ISDIR(inode->i_mode))
+		fuse_init_dir(inode);
+	else if (S_ISLNK(inode->i_mode))
+		fuse_init_symlink(inode);
+	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+		 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		fuse_init_common(inode);
+		init_special_inode(inode, inode->i_mode,
+				   new_decode_dev(attr->rdev));
+	} else {
+		/* Don't let user create weird files */
+		inode->i_mode = S_IFREG;
+		fuse_init_common(inode);
+		fuse_init_file_inode(inode);
+	}
+}
+
+static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+	unsigned long nodeid = *(unsigned long *) _nodeidp;
+	if (get_node_id(inode) == nodeid)
+		return 1;
+	else
+		return 0;
+}
+
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+	unsigned long nodeid = *(unsigned long *) _nodeidp;
+	get_fuse_inode(inode)->nodeid = nodeid;
+	return 0;
+}
+
+struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
+			int generation, struct fuse_attr *attr)
+{
+	struct inode *inode;
+	struct fuse_inode *fi;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	int retried = 0;
+
+ retry:
+	inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
+	if (!inode)
+		return NULL;
+
+	if ((inode->i_state & I_NEW)) {
+		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_generation = generation;
+		inode->i_data.backing_dev_info = &fc->bdi;
+		fuse_init_inode(inode, attr);
+		unlock_new_inode(inode);
+	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+		BUG_ON(retried);
+		/* Inode has changed type, any I/O on the old should fail */
+		make_bad_inode(inode);
+		iput(inode);
+		retried = 1;
+		goto retry;
+	}
+
+	fi = get_fuse_inode(inode);
+	fi->nlookup ++;
+	fuse_change_attributes(inode, attr);
+	return inode;
+}
+
+static void fuse_put_super(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	down_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background))
+		fuse_release_background(list_entry(fc->background.next,
+						   struct fuse_req, bg_entry));
+
+	spin_lock(&fuse_lock);
+	fc->mounted = 0;
+	fc->user_id = 0;
+	fc->group_id = 0;
+	fc->flags = 0;
+	/* Flush all readers on this fs */
+	wake_up_all(&fc->waitq);
+	up_write(&fc->sbput_sem);
+	fuse_release_conn(fc);
+	spin_unlock(&fuse_lock);
+}
+
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+	stbuf->f_type    = FUSE_SUPER_MAGIC;
+	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_blocks  = attr->blocks;
+	stbuf->f_bfree   = attr->bfree;
+	stbuf->f_bavail  = attr->bavail;
+	stbuf->f_files   = attr->files;
+	stbuf->f_ffree   = attr->ffree;
+	stbuf->f_namelen = attr->namelen;
+	/* fsid is left zero */
+}
+
+static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct fuse_req *req;
+	struct fuse_statfs_out outarg;
+	int err;
+
+        req = fuse_get_request(fc);
+	if (!req)
+		return -EINTR;
+
+	req->in.numargs = 0;
+	req->in.h.opcode = FUSE_STATFS;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	if (!err)
+		convert_fuse_statfs(buf, &outarg.st);
+	fuse_put_request(fc, req);
+	return err;
+}
+
+enum {
+	OPT_FD,
+	OPT_ROOTMODE,
+	OPT_USER_ID,
+	OPT_GROUP_ID,
+	OPT_DEFAULT_PERMISSIONS,
+	OPT_ALLOW_OTHER,
+	OPT_MAX_READ,
+	OPT_ERR
+};
+
+static match_table_t tokens = {
+	{OPT_FD,			"fd=%u"},
+	{OPT_ROOTMODE,			"rootmode=%o"},
+	{OPT_USER_ID,			"user_id=%u"},
+	{OPT_GROUP_ID,			"group_id=%u"},
+	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
+	{OPT_ALLOW_OTHER,		"allow_other"},
+	{OPT_MAX_READ,			"max_read=%u"},
+	{OPT_ERR,			NULL}
+};
+
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d)
+{
+	char *p;
+	memset(d, 0, sizeof(struct fuse_mount_data));
+	d->max_read = ~0;
+
+	while ((p = strsep(&opt, ",")) != NULL) {
+		int token;
+		int value;
+		substring_t args[MAX_OPT_ARGS];
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case OPT_FD:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->fd = value;
+			d->fd_present = 1;
+			break;
+
+		case OPT_ROOTMODE:
+			if (match_octal(&args[0], &value))
+				return 0;
+			d->rootmode = value;
+			d->rootmode_present = 1;
+			break;
+
+		case OPT_USER_ID:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->user_id = value;
+			d->user_id_present = 1;
+			break;
+
+		case OPT_GROUP_ID:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->group_id = value;
+			d->group_id_present = 1;
+			break;
+
+		case OPT_DEFAULT_PERMISSIONS:
+			d->flags |= FUSE_DEFAULT_PERMISSIONS;
+			break;
+
+		case OPT_ALLOW_OTHER:
+			d->flags |= FUSE_ALLOW_OTHER;
+			break;
+
+		case OPT_MAX_READ:
+			if (match_int(&args[0], &value))
+				return 0;
+			d->max_read = value;
+			break;
+
+		default:
+			return 0;
+		}
+	}
+
+	if (!d->fd_present || !d->rootmode_present ||
+	    !d->user_id_present || !d->group_id_present)
+		return 0;
+
+	return 1;
+}
+
+static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
+
+	seq_printf(m, ",user_id=%u", fc->user_id);
+	seq_printf(m, ",group_id=%u", fc->group_id);
+	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+		seq_puts(m, ",default_permissions");
+	if (fc->flags & FUSE_ALLOW_OTHER)
+		seq_puts(m, ",allow_other");
+	if (fc->max_read != ~0)
+		seq_printf(m, ",max_read=%u", fc->max_read);
+	return 0;
+}
+
+static void free_conn(struct fuse_conn *fc)
+{
+	while (!list_empty(&fc->unused_list)) {
+		struct fuse_req *req;
+		req = list_entry(fc->unused_list.next, struct fuse_req, list);
+		list_del(&req->list);
+		fuse_request_free(req);
+	}
+	kfree(fc);
+}
+
+/* Must be called with the fuse lock held */
+void fuse_release_conn(struct fuse_conn *fc)
+{
+	fc->count--;
+	if (!fc->count)
+		free_conn(fc);
+}
+
+static struct fuse_conn *new_conn(void)
+{
+	struct fuse_conn *fc;
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	if (fc != NULL) {
+		int i;
+		memset(fc, 0, sizeof(*fc));
+		init_waitqueue_head(&fc->waitq);
+		INIT_LIST_HEAD(&fc->pending);
+		INIT_LIST_HEAD(&fc->processing);
+		INIT_LIST_HEAD(&fc->unused_list);
+		INIT_LIST_HEAD(&fc->background);
+		sema_init(&fc->outstanding_sem, 0);
+		init_rwsem(&fc->sbput_sem);
+		for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
+			struct fuse_req *req = fuse_request_alloc();
+			if (!req) {
+				free_conn(fc);
+				return NULL;
+			}
+			list_add(&req->list, &fc->unused_list);
+		}
+		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+		fc->reqctr = 0;
+	}
+	return fc;
+}
+
+static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
+{
+	struct fuse_conn *fc;
+
+	if (file->f_op != &fuse_dev_operations)
+		return ERR_PTR(-EINVAL);
+	fc = new_conn();
+	if (fc == NULL)
+		return ERR_PTR(-ENOMEM);
+	spin_lock(&fuse_lock);
+	if (file->private_data) {
+		free_conn(fc);
+		fc = ERR_PTR(-EINVAL);
+	} else {
+		file->private_data = fc;
+		*get_fuse_conn_super_p(sb) = fc;
+		fc->mounted = 1;
+		fc->connected = 1;
+		fc->count = 2;
+	}
+	spin_unlock(&fuse_lock);
+	return fc;
+}
+
+static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+{
+	struct fuse_attr attr;
+	memset(&attr, 0, sizeof(attr));
+
+	attr.mode = mode;
+	attr.ino = FUSE_ROOT_ID;
+	return fuse_iget(sb, 1, 0, &attr);
+}
+
+static struct super_operations fuse_super_operations = {
+	.alloc_inode    = fuse_alloc_inode,
+	.destroy_inode  = fuse_destroy_inode,
+	.read_inode	= fuse_read_inode,
+	.clear_inode	= fuse_clear_inode,
+	.put_super	= fuse_put_super,
+	.statfs		= fuse_statfs,
+	.show_options	= fuse_show_options,
+};
+
+static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct fuse_conn *fc;
+	struct inode *root;
+	struct fuse_mount_data d;
+	struct file *file;
+	int err;
+
+	if (!parse_fuse_opt((char *) data, &d))
+		return -EINVAL;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = FUSE_SUPER_MAGIC;
+	sb->s_op = &fuse_super_operations;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	file = fget(d.fd);
+	if (!file)
+		return -EINVAL;
+
+	fc = get_conn(file, sb);
+	fput(file);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	fc->flags = d.flags;
+	fc->user_id = d.user_id;
+	fc->group_id = d.group_id;
+	fc->max_read = d.max_read;
+	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
+		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+	fc->max_write = FUSE_MAX_IN / 2;
+
+	err = -ENOMEM;
+	root = get_root_inode(sb, d.rootmode);
+	if (root == NULL)
+		goto err;
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		goto err;
+	}
+	fuse_send_init(fc);
+	return 0;
+
+ err:
+	spin_lock(&fuse_lock);
+	fuse_release_conn(fc);
+	spin_unlock(&fuse_lock);
+	return err;
+}
+
+static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *raw_data)
+{
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+}
+
+static struct file_system_type fuse_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fuse",
+	.get_sb		= fuse_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
+				 unsigned long flags)
+{
+	struct inode * inode = foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(inode);
+}
+
+static int __init fuse_fs_init(void)
+{
+	int err;
+
+	err = register_filesystem(&fuse_fs_type);
+	if (err)
+		printk("fuse: failed to register filesystem\n");
+	else {
+		fuse_inode_cachep = kmem_cache_create("fuse_inode",
+						      sizeof(struct fuse_inode),
+						      0, SLAB_HWCACHE_ALIGN,
+						      fuse_inode_init_once, NULL);
+		if (!fuse_inode_cachep) {
+			unregister_filesystem(&fuse_fs_type);
+			err = -ENOMEM;
+		}
+	}
+
+	return err;
+}
+
+static void fuse_fs_cleanup(void)
+{
+	unregister_filesystem(&fuse_fs_type);
+	kmem_cache_destroy(fuse_inode_cachep);
+}
+
+static int __init fuse_init(void)
+{
+	int res;
+
+	printk("fuse init (API version %i.%i)\n",
+	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+
+	spin_lock_init(&fuse_lock);
+	res = fuse_fs_init();
+	if (res)
+		goto err;
+
+	res = fuse_dev_init();
+	if (res)
+		goto err_fs_cleanup;
+
+	return 0;
+
+ err_fs_cleanup:
+	fuse_fs_cleanup();
+ err:
+	return res;
+}
+
+static void __exit fuse_exit(void)
+{
+	printk(KERN_DEBUG "fuse exit\n");
+
+	fuse_fs_cleanup();
+	fuse_dev_cleanup();
+}
+
+module_init(fuse_init);
+module_exit(fuse_exit);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index a096c5a5666..3d5cdc6847c 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -13,8 +13,6 @@
 
 #include "btree.h"
 
-#define REF_PAGES	0
-
 void hfs_bnode_read(struct hfs_bnode *node, void *buf,
 		int off, int len)
 {
@@ -289,9 +287,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-#if !REF_PAGES
 		page_cache_release(page);
-#endif
 		node->page[i] = page;
 	}
 
@@ -449,13 +445,6 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
 	if (node) {
 		atomic_inc(&node->refcnt);
-#if REF_PAGES
-		{
-		int i;
-		for (i = 0; i < node->tree->pages_per_bnode; i++)
-			get_page(node->page[i]);
-		}
-#endif
 		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
 		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
 	}
@@ -472,20 +461,12 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
 		if (!atomic_read(&node->refcnt))
 			BUG();
-		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) {
-#if REF_PAGES
-			for (i = 0; i < tree->pages_per_bnode; i++)
-				put_page(node->page[i]);
-#endif
+		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
-		}
 		for (i = 0; i < tree->pages_per_bnode; i++) {
 			if (!node->page[i])
 				continue;
 			mark_page_accessed(node->page[i]);
-#if REF_PAGES
-			put_page(node->page[i]);
-#endif
 		}
 
 		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 65dedefcabf..2fcd679f023 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -20,12 +20,12 @@
  *
  * Given the ID of the parent and the name build a search key.
  */
-void hfs_cat_build_key(btree_key *key, u32 parent, struct qstr *name)
+void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name)
 {
 	key->cat.reserved = 0;
 	key->cat.ParID = cpu_to_be32(parent);
 	if (name) {
-		hfs_triv2mac(&key->cat.CName, name);
+		hfs_asc2mac(sb, &key->cat.CName, name);
 		key->key_len = 6 + key->cat.CName.len;
 	} else {
 		memset(&key->cat.CName, 0, sizeof(struct hfs_name));
@@ -62,13 +62,14 @@ static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode)
 	}
 }
 
-static int hfs_cat_build_thread(hfs_cat_rec *rec, int type,
+static int hfs_cat_build_thread(struct super_block *sb,
+				hfs_cat_rec *rec, int type,
 				u32 parentid, struct qstr *name)
 {
 	rec->type = type;
 	memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved));
 	rec->thread.ParID = cpu_to_be32(parentid);
-	hfs_triv2mac(&rec->thread.CName, name);
+	hfs_asc2mac(sb, &rec->thread.CName, name);
 	return sizeof(struct hfs_cat_thread);
 }
 
@@ -93,8 +94,8 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
 	sb = dir->i_sb;
 	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
 
-	hfs_cat_build_key(fd.search_key, cnid, NULL);
-	entry_size = hfs_cat_build_thread(&entry, S_ISDIR(inode->i_mode) ?
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
+	entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
 			HFS_CDR_THD : HFS_CDR_FTH,
 			dir->i_ino, str);
 	err = hfs_brec_find(&fd);
@@ -107,7 +108,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
 	if (err)
 		goto err2;
 
-	hfs_cat_build_key(fd.search_key, dir->i_ino, str);
+	hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
 	entry_size = hfs_cat_build_record(&entry, cnid, inode);
 	err = hfs_brec_find(&fd);
 	if (err != -ENOENT) {
@@ -127,7 +128,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
 	return 0;
 
 err1:
-	hfs_cat_build_key(fd.search_key, cnid, NULL);
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
 	if (!hfs_brec_find(&fd))
 		hfs_brec_remove(&fd);
 err2:
@@ -176,7 +177,7 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
 	hfs_cat_rec rec;
 	int res, len, type;
 
-	hfs_cat_build_key(fd->search_key, cnid, NULL);
+	hfs_cat_build_key(sb, fd->search_key, cnid, NULL);
 	res = hfs_brec_read(fd, &rec, sizeof(rec));
 	if (res)
 		return res;
@@ -211,7 +212,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
 	sb = dir->i_sb;
 	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
 
-	hfs_cat_build_key(fd.search_key, dir->i_ino, str);
+	hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
 	res = hfs_brec_find(&fd);
 	if (res)
 		goto out;
@@ -239,7 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
 	if (res)
 		goto out;
 
-	hfs_cat_build_key(fd.search_key, cnid, NULL);
+	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
 	res = hfs_brec_find(&fd);
 	if (!res) {
 		res = hfs_brec_remove(&fd);
@@ -280,7 +281,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
-	hfs_cat_build_key(src_fd.search_key, src_dir->i_ino, src_name);
+	hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
 	err = hfs_brec_find(&src_fd);
 	if (err)
 		goto out;
@@ -289,7 +290,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 			    src_fd.entrylength);
 
 	/* create new dir entry with the data from the old entry */
-	hfs_cat_build_key(dst_fd.search_key, dst_dir->i_ino, dst_name);
+	hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
 	err = hfs_brec_find(&dst_fd);
 	if (err != -ENOENT) {
 		if (!err)
@@ -305,7 +306,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 	mark_inode_dirty(dst_dir);
 
 	/* finally remove the old entry */
-	hfs_cat_build_key(src_fd.search_key, src_dir->i_ino, src_name);
+	hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
 	err = hfs_brec_find(&src_fd);
 	if (err)
 		goto out;
@@ -321,7 +322,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 		goto out;
 
 	/* remove old thread entry */
-	hfs_cat_build_key(src_fd.search_key, cnid, NULL);
+	hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL);
 	err = hfs_brec_find(&src_fd);
 	if (err)
 		goto out;
@@ -330,8 +331,8 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 		goto out;
 
 	/* create new thread entry */
-	hfs_cat_build_key(dst_fd.search_key, cnid, NULL);
-	entry_size = hfs_cat_build_thread(&entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
+	hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
+	entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
 					dst_dir->i_ino, dst_name);
 	err = hfs_brec_find(&dst_fd);
 	if (err != -ENOENT) {
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index c55998262ae..e1f24befba5 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -28,7 +28,7 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	dentry->d_op = &hfs_dentry_operations;
 
 	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
-	hfs_cat_build_key(fd.search_key, dir->i_ino, &dentry->d_name);
+	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
 	if (res) {
 		hfs_find_exit(&fd);
@@ -56,7 +56,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
 	int len, err;
-	char strbuf[HFS_NAMELEN + 1];
+	char strbuf[HFS_MAX_NAMELEN];
 	union hfs_cat_rec entry;
 	struct hfs_find_data fd;
 	struct hfs_readdir_data *rd;
@@ -66,7 +66,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		return 0;
 
 	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
-	hfs_cat_build_key(fd.search_key, inode->i_ino, NULL);
+	hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
 		goto out;
@@ -111,7 +111,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		type = entry.type;
-		len = hfs_mac2triv(strbuf, &fd.key->cat.CName);
+		len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
 		if (type == HFS_CDR_DIR) {
 			if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
 				printk("HFS: small dir entry\n");
@@ -307,7 +307,8 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   old_dir, &old_dentry->d_name,
 			   new_dir, &new_dentry->d_name);
 	if (!res)
-		hfs_cat_build_key((btree_key *)&HFS_I(old_dentry->d_inode)->cat_key,
+		hfs_cat_build_key(old_dir->i_sb,
+				  (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key,
 				  new_dir->i_ino, &new_dentry->d_name);
 	return res;
 }
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index df6b33adee3..88099ab1a18 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -25,6 +25,7 @@
 #define HFS_SECTOR_SIZE		512    /* size of an HFS sector */
 #define HFS_SECTOR_SIZE_BITS	9      /* log_2(HFS_SECTOR_SIZE) */
 #define HFS_NAMELEN		31     /* maximum length of an HFS filename */
+#define HFS_MAX_NAMELEN		128
 #define HFS_MAX_VALENCE		32767U
 
 /* Meanings of the drAtrb field of the MDB,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 0dc8ef8e14d..aae019aadf8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -141,6 +141,8 @@ struct hfs_sb_info {
 
 	int session, part;
 
+	struct nls_table *nls_io, *nls_disk;
+
 	struct semaphore bitmap_lock;
 
 	unsigned long flags;
@@ -168,7 +170,7 @@ extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *);
 extern int hfs_cat_delete(u32, struct inode *, struct qstr *);
 extern int hfs_cat_move(u32, struct inode *, struct qstr *,
 			struct inode *, struct qstr *);
-extern void hfs_cat_build_key(btree_key *, u32, struct qstr *);
+extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *);
 
 /* dir.c */
 extern struct file_operations hfs_dir_operations;
@@ -222,8 +224,8 @@ extern int hfs_strcmp(const unsigned char *, unsigned int,
 extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 
 /* trans.c */
-extern void hfs_triv2mac(struct hfs_name *, struct qstr *);
-extern int hfs_mac2triv(char *, const struct hfs_name *);
+extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
+extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
 
 extern struct timezone sys_tz;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 75191232609..f1570b9f9de 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -160,7 +160,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
 
 	init_MUTEX(&HFS_I(inode)->extents_lock);
 	INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
-	hfs_cat_build_key((btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
+	hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
 	inode->i_ino = HFS_SB(sb)->next_id++;
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 217e32f37e0..0a473f79c89 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -10,6 +10,7 @@
 
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
+#include <linux/nls.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
@@ -343,6 +344,11 @@ void hfs_mdb_put(struct super_block *sb)
 	brelse(HFS_SB(sb)->mdb_bh);
 	brelse(HFS_SB(sb)->alt_mdb_bh);
 
+	if (HFS_SB(sb)->nls_io)
+		unload_nls(HFS_SB(sb)->nls_io);
+	if (HFS_SB(sb)->nls_disk)
+		unload_nls(HFS_SB(sb)->nls_disk);
+
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ab783f6afa3..c5074aeafca 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -15,8 +15,11 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/mount.h>
 #include <linux/init.h>
+#include <linux/nls.h>
 #include <linux/parser.h>
+#include <linux/seq_file.h>
 #include <linux/vfs.h>
 
 #include "hfs_fs.h"
@@ -111,6 +114,32 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
+static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+	struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb);
+
+	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
+		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+	if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
+		seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+	seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid);
+	if (sbi->s_file_umask != 0133)
+		seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
+	if (sbi->s_dir_umask != 0022)
+		seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask);
+	if (sbi->part >= 0)
+		seq_printf(seq, ",part=%u", sbi->part);
+	if (sbi->session >= 0)
+		seq_printf(seq, ",session=%u", sbi->session);
+	if (sbi->nls_disk)
+		seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset);
+	if (sbi->nls_io)
+		seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset);
+	if (sbi->s_quiet)
+		seq_printf(seq, ",quiet");
+	return 0;
+}
+
 static struct inode *hfs_alloc_inode(struct super_block *sb)
 {
 	struct hfs_inode_info *i;
@@ -133,11 +162,13 @@ static struct super_operations hfs_super_operations = {
 	.write_super	= hfs_write_super,
 	.statfs		= hfs_statfs,
 	.remount_fs     = hfs_remount,
+	.show_options	= hfs_show_options,
 };
 
 enum {
 	opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
 	opt_part, opt_session, opt_type, opt_creator, opt_quiet,
+	opt_codepage, opt_iocharset,
 	opt_err
 };
 
@@ -152,6 +183,8 @@ static match_table_t tokens = {
 	{ opt_type, "type=%s" },
 	{ opt_creator, "creator=%s" },
 	{ opt_quiet, "quiet" },
+	{ opt_codepage, "codepage=%s" },
+	{ opt_iocharset, "iocharset=%s" },
 	{ opt_err, NULL }
 };
 
@@ -257,11 +290,46 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 		case opt_quiet:
 			hsb->s_quiet = 1;
 			break;
+		case opt_codepage:
+			if (hsb->nls_disk) {
+				printk("HFS+-fs: unable to change codepage\n");
+				return 0;
+			}
+			p = match_strdup(&args[0]);
+			hsb->nls_disk = load_nls(p);
+			if (!hsb->nls_disk) {
+				printk("HFS+-fs: unable to load codepage \"%s\"\n", p);
+				kfree(p);
+				return 0;
+			}
+			kfree(p);
+			break;
+		case opt_iocharset:
+			if (hsb->nls_io) {
+				printk("HFS: unable to change iocharset\n");
+				return 0;
+			}
+			p = match_strdup(&args[0]);
+			hsb->nls_io = load_nls(p);
+			if (!hsb->nls_io) {
+				printk("HFS: unable to load iocharset \"%s\"\n", p);
+				kfree(p);
+				return 0;
+			}
+			kfree(p);
+			break;
 		default:
 			return 0;
 		}
 	}
 
+	if (hsb->nls_disk && !hsb->nls_io) {
+		hsb->nls_io = load_nls_default();
+		if (!hsb->nls_io) {
+			printk("HFS: unable to load default iocharset\n");
+			return 0;
+		}
+	}
 	hsb->s_dir_umask &= 0777;
 	hsb->s_file_umask &= 0577;
 
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index fb9720abbad..e673a88b8ae 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -9,12 +9,15 @@
  * with ':' vs. '/' as the path-element separator.
  */
 
+#include <linux/types.h>
+#include <linux/nls.h>
+
 #include "hfs_fs.h"
 
 /*================ Global functions ================*/
 
 /*
- * hfs_mac2triv()
+ * hfs_mac2asc()
  *
  * Given a 'Pascal String' (a string preceded by a length byte) in
  * the Macintosh character set produce the corresponding filename using
@@ -27,23 +30,58 @@
  * by ':' which never appears in HFS filenames.	 All other characters
  * are passed unchanged from input to output.
  */
-int hfs_mac2triv(char *out, const struct hfs_name *in)
+int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
 {
-	const char *p;
-	char c;
-	int i, len;
+	struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
+	struct nls_table *nls_io = HFS_SB(sb)->nls_io;
+	const char *src;
+	char *dst;
+	int srclen, dstlen, size;
+
+	src = in->name;
+	srclen = in->len;
+	dst = out;
+	dstlen = HFS_MAX_NAMELEN;
+	if (nls_io) {
+		wchar_t ch;
 
-	len = in->len;
-	p = in->name;
-	for (i = 0; i < len; i++) {
-		c = *p++;
-		*out++ = c == '/' ? ':' : c;
+		while (srclen > 0) {
+			if (nls_disk) {
+				size = nls_disk->char2uni(src, srclen, &ch);
+				if (size <= 0) {
+					ch = '?';
+					size = 1;
+				}
+				src += size;
+				srclen -= size;
+			} else {
+				ch = *src++;
+				srclen--;
+			}
+			if (ch == '/')
+				ch = ':';
+			size = nls_io->uni2char(ch, dst, dstlen);
+			if (size < 0) {
+				if (size == -ENAMETOOLONG)
+					goto out;
+				*dst = '?';
+				size = 1;
+			}
+			dst += size;
+			dstlen -= size;
+		}
+	} else {
+		char ch;
+
+		while (--srclen >= 0)
+			*dst++ = (ch = *src++) == '/' ? ':' : ch;
 	}
-	return i;
+out:
+	return dst - out;
 }
 
 /*
- * hfs_triv2mac()
+ * hfs_asc2mac()
  *
  * Given an ASCII string (not null-terminated) and its length,
  * generate the corresponding filename in the Macintosh character set
@@ -54,19 +92,57 @@ int hfs_mac2triv(char *out, const struct hfs_name *in)
  * This routine is a inverse to hfs_mac2triv().
  * A ':' is replaced by a '/'.
  */
-void hfs_triv2mac(struct hfs_name *out, struct qstr *in)
+void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in)
 {
+	struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
+	struct nls_table *nls_io = HFS_SB(sb)->nls_io;
 	const char *src;
-	char *dst, c;
-	int i, len;
+	char *dst;
+	int srclen, dstlen, size;
 
-	out->len = len = min((unsigned int)HFS_NAMELEN, in->len);
 	src = in->name;
+	srclen = in->len;
 	dst = out->name;
-	for (i = 0; i < len; i++) {
-		c = *src++;
-		*dst++ = c == ':' ? '/' : c;
+	dstlen = HFS_NAMELEN;
+	if (nls_io) {
+		wchar_t ch;
+
+		while (srclen > 0) {
+			size = nls_io->char2uni(src, srclen, &ch);
+			if (size < 0) {
+				ch = '?';
+				size = 1;
+			}
+			src += size;
+			srclen -= size;
+			if (ch == ':')
+				ch = '/';
+			if (nls_disk) {
+				size = nls_disk->uni2char(ch, dst, dstlen);
+				if (size < 0) {
+					if (size == -ENAMETOOLONG)
+						goto out;
+					*dst = '?';
+					size = 1;
+				}
+				dst += size;
+				dstlen -= size;
+			} else {
+				*dst++ = ch > 0xff ? '?' : ch;
+				dstlen--;
+			}
+		}
+	} else {
+		char ch;
+
+		if (dstlen > srclen)
+			dstlen = srclen;
+		while (--dstlen >= 0)
+			*dst++ = (ch = *src++) == ':' ? '/' : ch;
 	}
-	for (; i < HFS_NAMELEN; i++)
+out:
+	out->len = dst - (char *)out->name;
+	dstlen = HFS_NAMELEN - out->len;
+	while (--dstlen >= 0)
 		*dst++ = 0;
 }
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 8868d3b766f..b85abc6e6f8 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,8 +18,6 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 
-#define REF_PAGES	0
-
 /* Copy a specified range of bytes from the raw data of a node */
 void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 {
@@ -450,9 +448,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-#if !REF_PAGES
 		page_cache_release(page);
-#endif
 		node->page[i] = page;
 	}
 
@@ -612,13 +608,6 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
 	if (node) {
 		atomic_inc(&node->refcnt);
-#if REF_PAGES
-		{
-		int i;
-		for (i = 0; i < node->tree->pages_per_bnode; i++)
-			get_page(node->page[i]);
-		}
-#endif
 		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
 		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
 	}
@@ -635,20 +624,12 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
 		if (!atomic_read(&node->refcnt))
 			BUG();
-		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) {
-#if REF_PAGES
-			for (i = 0; i < tree->pages_per_bnode; i++)
-				put_page(node->page[i]);
-#endif
+		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
-		}
 		for (i = 0; i < tree->pages_per_bnode; i++) {
 			if (!node->page[i])
 				continue;
 			mark_page_accessed(node->page[i]);
-#if REF_PAGES
-			put_page(node->page[i]);
-#endif
 		}
 
 		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 533094a570d..2bc0cdd30e5 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -343,8 +343,9 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* options.c */
-int parse_options(char *, struct hfsplus_sb_info *);
-void fill_defaults(struct hfsplus_sb_info *);
+int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
+void hfsplus_fill_defaults(struct hfsplus_sb_info *);
+int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 1cca0102c98..cca0818aa4c 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -13,6 +13,8 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/nls.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
 #include "hfsplus_fs.h"
 
 enum {
@@ -38,7 +40,7 @@ static match_table_t tokens = {
 };
 
 /* Initialize an options object to reasonable defaults */
-void fill_defaults(struct hfsplus_sb_info *opts)
+void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 {
 	if (!opts)
 		return;
@@ -63,7 +65,7 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
 
 /* Parse options from mount. Returns 0 on failure */
 /* input is the options passed to mount() as a string */
-int parse_options(char *input, struct hfsplus_sb_info *sbi)
+int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -160,3 +162,23 @@ done:
 
 	return 1;
 }
+
+int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+	struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+
+	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
+		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
+		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid);
+	if (sbi->part >= 0)
+		seq_printf(seq, ",part=%u", sbi->part);
+	if (sbi->session >= 0)
+		seq_printf(seq, ",session=%u", sbi->session);
+	if (sbi->nls)
+		seq_printf(seq, ",nls=%s", sbi->nls->charset);
+	if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+		seq_printf(seq, ",nodecompose");
+	return 0;
+}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index d55ad67b8e4..fd0f0f050e1 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -217,8 +217,7 @@ static void hfsplus_put_super(struct super_block *sb)
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
 		mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-		ll_rw_block(WRITE, 1, &HFSPLUS_SB(sb).s_vhbh);
-		wait_on_buffer(HFSPLUS_SB(sb).s_vhbh);
+		sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
 	}
 
 	hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
@@ -277,6 +276,7 @@ static struct super_operations hfsplus_sops = {
 	.write_super	= hfsplus_write_super,
 	.statfs		= hfsplus_statfs,
 	.remount_fs	= hfsplus_remount,
+	.show_options	= hfsplus_show_options,
 };
 
 static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
@@ -297,8 +297,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	memset(sbi, 0, sizeof(HFSPLUS_SB(sb)));
 	sb->s_fs_info = sbi;
 	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
-	fill_defaults(sbi);
-	if (!parse_options(data, sbi)) {
+	hfsplus_fill_defaults(sbi);
+	if (!hfsplus_parse_options(data, sbi)) {
 		if (!silent)
 			printk("HFS+-fs: unable to parse mount options\n");
 		err = -EINVAL;
@@ -415,8 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
 	mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-	ll_rw_block(WRITE, 1, &HFSPLUS_SB(sb).s_vhbh);
-	wait_on_buffer(HFSPLUS_SB(sb).s_vhbh);
+	sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
 
 	if (!HFSPLUS_SB(sb).hidden_dir) {
 		printk("HFS+: create hidden dir...\n");
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 67bca0d4a33..cca3fb693f9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -49,7 +49,6 @@ struct hostfs_iattr {
 	struct timespec	ia_atime;
 	struct timespec	ia_mtime;
 	struct timespec	ia_ctime;
-	unsigned int	ia_attr_flags;
 };
 
 extern int stat_file(const char *path, unsigned long long *inode_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b2d18200a00..59c5062cd63 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -284,6 +284,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 
 static void hostfs_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	if(HOSTFS_I(inode)->fd != -1) {
 		close_file(&HOSTFS_I(inode)->fd);
 		HOSTFS_I(inode)->fd = -1;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 38b1741fa53..e3d17e9ea6c 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -284,6 +284,7 @@ void hpfs_write_if_changed(struct inode *inode)
 
 void hpfs_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	lock_kernel();
 	hpfs_remove_fnode(inode->i_sb, inode->i_ino);
 	unlock_kernel();
diff --git a/fs/inode.c b/fs/inode.c
index e57f1724db3..f80a79ff156 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1034,19 +1034,21 @@ void generic_delete_inode(struct inode *inode)
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 
-	if (inode->i_data.nrpages)
-		truncate_inode_pages(&inode->i_data, 0);
-
 	security_inode_delete(inode);
 
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
 		if (!is_bad_inode(inode))
 			DQUOT_INIT(inode);
-		/* s_op->delete_inode internally recalls clear_inode() */
+		/* Filesystems implementing their own
+		 * s_op->delete_inode are required to call
+		 * truncate_inode_pages and clear_inode()
+		 * internally */
 		delete(inode);
-	} else
+	} else {
+		truncate_inode_pages(&inode->i_data, 0);
 		clear_inode(inode);
+	}
 	spin_lock(&inode_lock);
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode_lock);
@@ -1195,9 +1197,6 @@ void update_atime(struct inode *inode)
 	if (!timespec_equal(&inode->i_atime, &now)) {
 		inode->i_atime = now;
 		mark_inode_dirty_sync(inode);
-	} else {
-		if (!timespec_equal(&inode->i_atime, &now))
-			inode->i_atime = now;
 	}
 }
 
diff --git a/fs/inotify.c b/fs/inotify.c
index 2e4e2a57708..a37e9fb1da5 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -37,6 +37,7 @@
 #include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
+static atomic_t inotify_watches;
 
 static kmem_cache_t *watch_cachep;
 static kmem_cache_t *event_cachep;
@@ -422,6 +423,7 @@ static struct inotify_watch *create_watch(struct inotify_device *dev,
 	get_inotify_watch(watch);
 
 	atomic_inc(&dev->user->inotify_watches);
+	atomic_inc(&inotify_watches);
 
 	return watch;
 }
@@ -454,6 +456,7 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 	list_del(&watch->d_list);
 
 	atomic_dec(&dev->user->inotify_watches);
+	atomic_dec(&inotify_watches);
 	idr_remove(&dev->idr, watch->wd);
 	put_inotify_watch(watch);
 }
@@ -532,6 +535,9 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
 	struct dentry *parent;
 	struct inode *inode;
 
+	if (!atomic_read (&inotify_watches))
+		return;
+
 	spin_lock(&dentry->d_lock);
 	parent = dentry->d_parent;
 	inode = parent->d_inode;
@@ -925,6 +931,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	struct nameidata nd;
 	struct file *filp;
 	int ret, fput_needed;
+	int mask_add = 0;
 
 	filp = fget_light(fd, &fput_needed);
 	if (unlikely(!filp))
@@ -947,6 +954,9 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	down(&inode->inotify_sem);
 	down(&dev->sem);
 
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
+
 	/* don't let user-space set invalid bits: we don't want flags set */
 	mask &= IN_ALL_EVENTS;
 	if (unlikely(!mask)) {
@@ -960,7 +970,10 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	 */
 	old = inode_find_dev(inode, dev);
 	if (unlikely(old)) {
-		old->mask = mask;
+		if (mask_add)
+			old->mask |= mask;
+		else
+			old->mask = mask;
 		ret = old->wd;
 		goto out;
 	}
@@ -1043,6 +1056,7 @@ static int __init inotify_setup(void)
 	inotify_max_user_watches = 8192;
 
 	atomic_set(&inotify_cookie, 0);
+	atomic_set(&inotify_watches, 0);
 
 	watch_cachep = kmem_cache_create("inotify_watch_cache",
 					 sizeof(struct inotify_watch),
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5a97e346bd9..014a51fd00d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -204,7 +204,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 	int i;
 
 	spin_unlock(&journal->j_list_lock);
-	ll_rw_block(WRITE, *batch_count, bhs);
+	ll_rw_block(SWRITE, *batch_count, bhs);
 	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index dac720c837a..2a3e310f79e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -358,7 +358,7 @@ write_out_data:
 					jbd_debug(2, "submit %d writes\n",
 							bufs);
 					spin_unlock(&journal->j_list_lock);
-					ll_rw_block(WRITE, bufs, wbuf);
+					ll_rw_block(SWRITE, bufs, wbuf);
 					journal_brelse_array(wbuf, bufs);
 					bufs = 0;
 					goto write_out_data;
@@ -381,7 +381,7 @@ write_out_data:
 
 	if (bufs) {
 		spin_unlock(&journal->j_list_lock);
-		ll_rw_block(WRITE, bufs, wbuf);
+		ll_rw_block(SWRITE, bufs, wbuf);
 		journal_brelse_array(wbuf, bufs);
 		spin_lock(&journal->j_list_lock);
 	}
@@ -720,11 +720,17 @@ wait_for_iobuf:
 	J_ASSERT(commit_transaction->t_log_list == NULL);
 
 restart_loop:
+	/*
+	 * As there are other places (journal_unmap_buffer()) adding buffers
+	 * to this list we have to be careful and hold the j_list_lock.
+	 */
+	spin_lock(&journal->j_list_lock);
 	while (commit_transaction->t_forget) {
 		transaction_t *cp_transaction;
 		struct buffer_head *bh;
 
 		jh = commit_transaction->t_forget;
+		spin_unlock(&journal->j_list_lock);
 		bh = jh2bh(jh);
 		jbd_lock_bh_state(bh);
 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
@@ -792,9 +798,25 @@ restart_loop:
 			journal_remove_journal_head(bh);  /* needs a brelse */
 			release_buffer_page(bh);
 		}
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+	/*
+	 * This is a bit sleazy.  We borrow j_list_lock to protect
+	 * journal->j_committing_transaction in __journal_remove_checkpoint.
+	 * Really, __journal_remove_checkpoint should be using j_state_lock but
+	 * it's a bit hassle to hold that across __journal_remove_checkpoint
+	 */
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	/*
+	 * Now recheck if some buffers did not get attached to the transaction
+	 * while the lock was dropped...
+	 */
+	if (commit_transaction->t_forget) {
 		spin_unlock(&journal->j_list_lock);
-		if (cond_resched())
-			goto restart_loop;
+		spin_unlock(&journal->j_state_lock);
+		goto restart_loop;
 	}
 
 	/* Done with this transaction! */
@@ -803,14 +825,6 @@ restart_loop:
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
-	/*
-	 * This is a bit sleazy.  We borrow j_list_lock to protect
-	 * journal->j_committing_transaction in __journal_remove_checkpoint.
-	 * Really, __jornal_remove_checkpoint should be using j_state_lock but
-	 * it's a bit hassle to hold that across __journal_remove_checkpoint
-	 */
-	spin_lock(&journal->j_state_lock);
-	spin_lock(&journal->j_list_lock);
 	commit_transaction->t_state = T_FINISHED;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 5e7b4394951..7ae2c4fe506 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -65,7 +65,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_recover);
 EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
@@ -81,6 +80,7 @@ EXPORT_SYMBOL(journal_try_to_free_buffers);
 EXPORT_SYMBOL(journal_force_commit);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
 
 /*
  * Helper function used to manage commit timeouts
@@ -93,16 +93,6 @@ static void commit_timeout(unsigned long __data)
 	wake_up_process(p);
 }
 
-/* Static check for data structure consistency.  There's no code
- * invoked --- we'll just get a linker failure if things aren't right.
- */
-void __journal_internal_check(void)
-{
-	extern void journal_bad_superblock_size(void);
-	if (sizeof(struct journal_superblock_s) != 1024)
-		journal_bad_superblock_size();
-}
-
 /*
  * kjournald: The main thread function used to manage a logging device
  * journal.
@@ -119,16 +109,12 @@ void __journal_internal_check(void)
  *    known as checkpointing, and this thread is responsible for that job.
  */
 
-journal_t *current_journal;		// AKPM: debug
-
-int kjournald(void *arg)
+static int kjournald(void *arg)
 {
 	journal_t *journal = (journal_t *) arg;
 	transaction_t *transaction;
 	struct timer_list timer;
 
-	current_journal = journal;
-
 	daemonize("kjournald");
 
 	/* Set up an interval timer which can be used to trigger a
@@ -193,6 +179,8 @@ loop:
 		if (transaction && time_after_eq(jiffies,
 						transaction->t_expires))
 			should_sleep = 0;
+		if (journal->j_flags & JFS_UNMOUNT)
+ 			should_sleep = 0;
 		if (should_sleep) {
 			spin_unlock(&journal->j_state_lock);
 			schedule();
@@ -969,7 +957,7 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (wait)
 		sync_dirty_buffer(bh);
 	else
-		ll_rw_block(WRITE, 1, &bh);
+		ll_rw_block(SWRITE, 1, &bh);
 
 out:
 	/* If we have just flushed the log (by marking s_start==0), then
@@ -1439,7 +1427,7 @@ int journal_wipe(journal_t *journal, int write)
  * device this journal is present.
  */
 
-const char *journal_dev_name(journal_t *journal, char *buffer)
+static const char *journal_dev_name(journal_t *journal, char *buffer)
 {
 	struct block_device *bdev;
 
@@ -1485,7 +1473,7 @@ void __journal_abort_hard(journal_t *journal)
 
 /* Soft abort: record the abort error status in the journal superblock,
  * but don't do any other IO. */
-void __journal_abort_soft (journal_t *journal, int errno)
+static void __journal_abort_soft (journal_t *journal, int errno)
 {
 	if (journal->j_flags & JFS_ABORT)
 		return;
@@ -1880,7 +1868,7 @@ EXPORT_SYMBOL(journal_enable_debug);
 
 static struct proc_dir_entry *proc_jbd_debug;
 
-int read_jbd_debug(char *page, char **start, off_t off,
+static int read_jbd_debug(char *page, char **start, off_t off,
 			  int count, int *eof, void *data)
 {
 	int ret;
@@ -1890,7 +1878,7 @@ int read_jbd_debug(char *page, char **start, off_t off,
 	return ret;
 }
 
-int write_jbd_debug(struct file *file, const char __user *buffer,
+static int write_jbd_debug(struct file *file, const char __user *buffer,
 			   unsigned long count, void *data)
 {
 	char buf[32];
@@ -1979,6 +1967,14 @@ static int __init journal_init(void)
 {
 	int ret;
 
+/* Static check for data structure consistency.  There's no code
+ * invoked --- we'll just get a linker failure if things aren't right.
+ */
+	extern void journal_bad_superblock_size(void);
+	if (sizeof(struct journal_superblock_s) != 1024)
+		journal_bad_superblock_size();
+
+
 	ret = journal_init_caches();
 	if (ret != 0)
 		journal_destroy_caches();
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d327a598f86..a5614418346 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -116,7 +116,8 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+			      tid_t seq)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
@@ -613,7 +614,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block(WRITE, 1, &bh);
+	ll_rw_block(SWRITE, 1, &bh);
 }
 #endif
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 77b7662b840..c6ec66fd876 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -490,23 +490,21 @@ void journal_unlock_updates (journal_t *journal)
  */
 static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-	struct buffer_head *bh = jh2bh(jh);
 	int jlist;
 
-	if (buffer_dirty(bh)) {
-		/* If this buffer is one which might reasonably be dirty
-		 * --- ie. data, or not part of this journal --- then
-		 * we're OK to leave it alone, but otherwise we need to
-		 * move the dirty bit to the journal's own internal
-		 * JBDDirty bit. */
-		jlist = jh->b_jlist;
-
-		if (jlist == BJ_Metadata || jlist == BJ_Reserved || 
-		    jlist == BJ_Shadow || jlist == BJ_Forget) {
-			if (test_clear_buffer_dirty(jh2bh(jh))) {
-				set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
-			}
-		}
+	/* If this buffer is one which might reasonably be dirty
+	 * --- ie. data, or not part of this journal --- then
+	 * we're OK to leave it alone, but otherwise we need to
+	 * move the dirty bit to the journal's own internal
+	 * JBDDirty bit. */
+	jlist = jh->b_jlist;
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		struct buffer_head *bh = jh2bh(jh);
+
+		if (test_clear_buffer_dirty(bh))
+			set_buffer_jbddirty(bh);
 	}
 }
 
@@ -574,9 +572,14 @@ repeat:
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
-			JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-			jbd_unexpected_dirty_buffer(jh);
- 		}
+		}
+		/*
+		 * In any case we need to clean the dirty flag and we must
+		 * do it under the buffer lock to be sure we don't race
+		 * with running write-out.
+		 */
+		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+		jbd_unexpected_dirty_buffer(jh);
  	}
 
 	unlock_buffer(bh);
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index bfbeb4c86e0..3dcc6d2162c 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1629,9 +1629,6 @@ static int jffs_fsync(struct file *f, struct dentry *d, int datasync)
 }
 
 
-extern int generic_file_open(struct inode *, struct file *) __attribute__((weak));
-extern loff_t generic_file_llseek(struct file *, loff_t, int) __attribute__((weak));
-
 static struct file_operations jffs_file_operations =
 {
 	.open		= generic_file_open,
@@ -1747,6 +1744,7 @@ jffs_delete_inode(struct inode *inode)
 	D3(printk("jffs_delete_inode(): inode->i_ino == %lu\n",
 		  inode->i_ino));
 
+	truncate_inode_pages(&inode->i_data, 0);
 	lock_kernel();
 	inode->i_size = 0;
 	inode->i_blocks = 0;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bd9ed9b0247..8279bf0133f 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -21,9 +21,6 @@
 #include <linux/jffs2.h>
 #include "nodelist.h"
 
-extern int generic_file_open(struct inode *, struct file *) __attribute__((weak));
-extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) __attribute__((weak));
-
 static int jffs2_commit_write (struct file *filp, struct page *pg,
 			       unsigned start, unsigned end);
 static int jffs2_prepare_write (struct file *filp, struct page *pg,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 767c7ecb429..cff352f4ec1 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -132,6 +132,8 @@ void jfs_delete_inode(struct inode *inode)
 	    (JFS_IP(inode)->fileset != cpu_to_le32(FILESYSTEM_I)))
 			return;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (test_cflag(COMMIT_Freewmap, inode))
 		jfs_free_zero_link(inode);
 
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 86ccac80f0a..72a5588faec 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -37,6 +37,9 @@
 #define JFS_ERR_CONTINUE   0x00000004   /* continue */
 #define JFS_ERR_PANIC      0x00000008   /* panic */
 
+#define	JFS_USRQUOTA	0x00000010
+#define	JFS_GRPQUOTA	0x00000020
+
 /* platform option (conditional compilation) */
 #define JFS_AIX		0x80000000	/* AIX support */
 /*	POSIX name/directory  support */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9ff89720f93..71bc34b96b2 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -23,9 +23,11 @@
 #include <linux/parser.h>
 #include <linux/completion.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 #include <linux/moduleparam.h>
 #include <linux/posix_acl.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -192,7 +194,8 @@ static void jfs_put_super(struct super_block *sb)
 
 enum {
 	Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
-	Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err,
+	Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -204,8 +207,8 @@ static match_table_t tokens = {
 	{Opt_errors, "errors=%s"},
 	{Opt_ignore, "noquota"},
 	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
-	{Opt_ignore, "grpquota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -293,6 +296,24 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 			}
 			break;
 		}
+
+#if defined(CONFIG_QUOTA)
+		case Opt_quota:
+		case Opt_usrquota:
+			*flag |= JFS_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			*flag |= JFS_GRPQUOTA;
+			break;
+#else
+		case Opt_usrquota:
+		case Opt_grpquota:
+		case Opt_quota:
+			printk(KERN_ERR
+			       "JFS: quota operations not supported\n");
+			break;
+#endif
+
 		default:
 			printk("jfs: Unrecognized mount option \"%s\" "
 					" or missing value\n", p);
@@ -539,6 +560,26 @@ static int jfs_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb);
+
+	if (sbi->flag & JFS_NOINTEGRITY)
+		seq_puts(seq, ",nointegrity");
+	else
+		seq_puts(seq, ",integrity");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->flag & JFS_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->flag & JFS_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
 static struct super_operations jfs_super_operations = {
 	.alloc_inode	= jfs_alloc_inode,
 	.destroy_inode	= jfs_destroy_inode,
@@ -552,6 +593,7 @@ static struct super_operations jfs_super_operations = {
 	.unlockfs       = jfs_unlockfs,
 	.statfs		= jfs_statfs,
 	.remount_fs	= jfs_remount,
+	.show_options	= jfs_show_options
 };
 
 static struct export_operations jfs_export_operations = {
diff --git a/fs/locks.c b/fs/locks.c
index 11956b6179f..c2c09b4798d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2198,21 +2198,23 @@ void steal_locks(fl_owner_t from)
 {
 	struct files_struct *files = current->files;
 	int i, j;
+	struct fdtable *fdt;
 
 	if (from == files)
 		return;
 
 	lock_kernel();
 	j = 0;
+	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= files->max_fdset || i >= files->max_fds)
+		if (i >= fdt->max_fdset || i >= fdt->max_fds)
 			break;
-		set = files->open_fds->fds_bits[j++];
+		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
 			if (set & 1) {
-				struct file *file = files->fd[i];
+				struct file *file = fdt->fd[i];
 				if (file)
 					__steal_locks(file, from);
 			}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 3f18c21198d..790cc0d0e97 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,6 +24,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	minix_truncate(inode);
 	minix_free_inode(inode);
diff --git a/fs/namei.c b/fs/namei.c
index 6ec1f0fefc5..21d85f1ac83 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -525,6 +525,22 @@ static inline int __do_follow_link(struct path *path, struct nameidata *nd)
 	return error;
 }
 
+static inline void dput_path(struct path *path, struct nameidata *nd)
+{
+	dput(path->dentry);
+	if (path->mnt != nd->mnt)
+		mntput(path->mnt);
+}
+
+static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
+{
+	dput(nd->dentry);
+	if (nd->mnt != path->mnt)
+		mntput(nd->mnt);
+	nd->mnt = path->mnt;
+	nd->dentry = path->dentry;
+}
+
 /*
  * This limits recursive symlink follows to 8, while
  * limiting consecutive symlinks to 40.
@@ -552,9 +568,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	nd->depth--;
 	return err;
 loop:
-	dput(path->dentry);
-	if (path->mnt != nd->mnt)
-		mntput(path->mnt);
+	dput_path(path, nd);
 	path_release(nd);
 	return err;
 }
@@ -813,13 +827,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
 			err = -ENOTDIR; 
 			if (!inode->i_op)
 				break;
-		} else {
-			dput(nd->dentry);
-			if (nd->mnt != next.mnt)
-				mntput(nd->mnt);
-			nd->mnt = next.mnt;
-			nd->dentry = next.dentry;
-		}
+		} else
+			path_to_nameidata(&next, nd);
 		err = -ENOTDIR; 
 		if (!inode->i_op->lookup)
 			break;
@@ -859,13 +868,8 @@ last_component:
 			if (err)
 				goto return_err;
 			inode = nd->dentry->d_inode;
-		} else {
-			dput(nd->dentry);
-			if (nd->mnt != next.mnt)
-				mntput(nd->mnt);
-			nd->mnt = next.mnt;
-			nd->dentry = next.dentry;
-		}
+		} else
+			path_to_nameidata(&next, nd);
 		err = -ENOENT;
 		if (!inode)
 			break;
@@ -901,9 +905,7 @@ return_reval:
 return_base:
 		return 0;
 out_dput:
-		dput(next.dentry);
-		if (nd->mnt != next.mnt)
-			mntput(next.mnt);
+		dput_path(&next, nd);
 		break;
 	}
 	path_release(nd);
@@ -1314,10 +1316,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		return error;
 	DQUOT_INIT(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_create(dir, dentry, mode);
-	}
 	return error;
 }
 
@@ -1507,11 +1507,7 @@ do_last:
 	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
 		goto do_link;
 
-	dput(nd->dentry);
-	nd->dentry = path.dentry;
-	if (nd->mnt != path.mnt)
-		mntput(nd->mnt);
-	nd->mnt = path.mnt;
+	path_to_nameidata(&path, nd);
 	error = -EISDIR;
 	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
@@ -1522,9 +1518,7 @@ ok:
 	return 0;
 
 exit_dput:
-	dput(path.dentry);
-	if (nd->mnt != path.mnt)
-		mntput(path.mnt);
+	dput_path(&path, nd);
 exit:
 	path_release(nd);
 	return error;
@@ -1639,10 +1633,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_mknod(dir, dentry, mode, dev);
-	}
 	return error;
 }
 
@@ -1712,10 +1704,8 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
-	if (!error) {
+	if (!error)
 		fsnotify_mkdir(dir, dentry->d_name.name);
-		security_inode_post_mkdir(dir,dentry, mode);
-	}
 	return error;
 }
 
@@ -1951,10 +1941,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 
 	DQUOT_INIT(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, dentry->d_name.name);
-		security_inode_post_symlink(dir, dentry, oldname);
-	}
 	return error;
 }
 
@@ -2024,10 +2012,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	DQUOT_INIT(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	up(&old_dentry->d_inode->i_sem);
-	if (!error) {
+	if (!error)
 		fsnotify_create(dir, new_dentry->d_name.name);
-		security_inode_post_link(old_dentry, dir, new_dentry);
-	}
 	return error;
 }
 
@@ -2146,11 +2132,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 			d_rehash(new_dentry);
 		dput(new_dentry);
 	}
-	if (!error) {
+	if (!error)
 		d_move(old_dentry,new_dentry);
-		security_inode_post_rename(old_dir, old_dentry,
-					   new_dir, new_dentry);
-	}
 	return error;
 }
 
@@ -2176,7 +2159,6 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 		/* The following d_move() should become unconditional */
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
 			d_move(old_dentry, new_dentry);
-		security_inode_post_rename(old_dir, old_dentry, new_dir, new_dentry);
 	}
 	if (target)
 		up(&target->i_sem);
diff --git a/fs/namespace.c b/fs/namespace.c
index 79bd8a46e1e..34156260c9b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -40,7 +40,7 @@ static inline int sysfs_init(void)
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static struct list_head *mount_hashtable;
-static int hash_mask, hash_bits;
+static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache; 
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
@@ -1334,8 +1334,12 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
 	error = -EINVAL;
 	if (user_nd.mnt->mnt_root != user_nd.dentry)
 		goto out2; /* not a mountpoint */
+	if (user_nd.mnt->mnt_parent == user_nd.mnt)
+		goto out2; /* not attached */
 	if (new_nd.mnt->mnt_root != new_nd.dentry)
 		goto out2; /* not a mountpoint */
+	if (new_nd.mnt->mnt_parent == new_nd.mnt)
+		goto out2; /* not attached */
 	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
 	spin_lock(&vfsmount_lock);
 	if (tmp != new_nd.mnt) {
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 44795d2f4b3..8c8839203cd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -286,6 +286,8 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 static void
 ncp_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (S_ISDIR(inode->i_mode)) {
 		DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino);
 	}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 541b418327c..6922469d6fc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -146,6 +146,8 @@ nfs_delete_inode(struct inode * inode)
 {
 	dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	nfs_wb_all(inode);
 	/*
 	 * The following should never happen...
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9a11aa39e2e..057aff74550 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/hash.h>
+#include <linux/module.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -221,6 +222,7 @@ static int expkey_show(struct seq_file *m,
 }
 	
 struct cache_detail svc_expkey_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
 	.hash_table	= expkey_table,
 	.name		= "nfsd.fh",
@@ -456,6 +458,7 @@ static int svc_export_show(struct seq_file *m,
 	return 0;
 }
 struct cache_detail svc_export_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= EXPORT_HASHMAX,
 	.hash_table	= export_table,
 	.name		= "nfsd.export",
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5605a26efc5..13369650cdf 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -187,6 +187,7 @@ static int         idtoname_parse(struct cache_detail *, char *, int);
 static struct ent *idtoname_lookup(struct ent *, int);
 
 static struct cache_detail idtoname_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= idtoname_table,
 	.name		= "nfs4.idtoname",
@@ -320,6 +321,7 @@ static struct ent *nametoid_lookup(struct ent *, int);
 static int         nametoid_parse(struct cache_detail *, char *, int);
 
 static struct cache_detail nametoid_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= nametoid_table,
 	.name		= "nfs4.nametoid",
@@ -404,8 +406,10 @@ nfsd_idmap_init(void)
 void
 nfsd_idmap_shutdown(void)
 {
-	cache_unregister(&idtoname_cache);
-	cache_unregister(&nametoid_cache);
+	if (cache_unregister(&idtoname_cache))
+		printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
+	if (cache_unregister(&nametoid_cache))
+		printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
 }
 
 /*
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 57ed50fe7f8..954cf893d50 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -93,7 +93,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 
 	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
 			clname->len, clname->data);
-	tfm = crypto_alloc_tfm("md5", 0);
+	tfm = crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
 	if (tfm == NULL)
 		goto out;
 	cksum.len = crypto_tfm_alg_digestsize(tfm);
@@ -114,8 +114,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 	kfree(cksum.data);
 	status = nfs_ok;
 out:
-	if (tfm)
-		crypto_free_tfm(tfm);
+	crypto_free_tfm(tfm);
 	return status;
 }
 
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 9eecc9939df..e4fd6134244 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -22,6 +22,76 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
+2.1.24 - Lots of bug fixes and support more clean journal states.
+
+	- Support journals ($LogFile) which have been modified by chkdsk.  This
+	  means users can boot into Windows after we marked the volume dirty.
+	  The Windows boot will run chkdsk and then reboot.  The user can then
+	  immediately boot into Linux rather than having to do a full Windows
+	  boot first before rebooting into Linux and we will recognize such a
+	  journal and empty it as it is clean by definition.
+	- Support journals ($LogFile) with only one restart page as well as
+	  journals with two different restart pages.  We sanity check both and
+	  either use the only sane one or the more recent one of the two in the
+	  case that both are valid.
+	- Modify fs/ntfs/malloc.h::ntfs_malloc_nofs() to do the kmalloc() based
+	  allocations with __GFP_HIGHMEM, analogous to how the vmalloc() based
+	  allocations are done.
+	- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
+	  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
+	  hence cannot fail.
+	- Use ntfs_malloc_nofs_nofail() in the two critical regions in
+	  fs/ntfs/runlist.c::ntfs_runlists_merge().  This means we no longer
+	  need to panic() if the allocation fails as it now cannot fail.
+	- Fix two nasty runlist merging bugs that had gone unnoticed so far.
+	  Thanks to Stefano Picerno for the bug report.
+	- Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
+	- Fix handling of valid but empty mapping pairs array in
+	  fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
+	- Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
+	  messages and include the inode number.  Thanks to Yura Pakhuchiy for
+	  pointing this out.
+	- Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
+	  length is zero.
+	- Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
+	  specified hole into a runlist.
+	- Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the returned
+	  index entry is in the index root, we forgot to set the @ir pointer in
+	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
+	- Remove bogus setting of PageError in ntfs_read_compressed_block().
+	- Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
+	- Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
+	  access to the allocated size in the ntfs inode with the size lock.
+	- Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
+	  return LCN_ENOENT when there is no runlist and the allocated size is
+	  zero.
+	- Fix load_attribute_list() to handle the case of a NULL runlist.
+	- Fix handling of sparse attributes in ntfs_attr_make_non_resident().
+	- Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
+	  to ensure that these functions are never called for compressed or
+	  encrypted attributes.
+	- Fix cluster (de)allocators to work when the runlist is NULL and more
+	  importantly to take a locked runlist rather than them locking it
+	  which leads to lock reversal.
+	- Truncate {a,c,m}time to the ntfs supported time granularity when
+	  updating the times in the inode in ntfs_setattr().
+	- Fixup handling of sparse, compressed, and encrypted attributes in
+	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
+	  fs/ntfs/aops.c::ntfs_{read,write}page().
+	- Make ntfs_write_block() not instantiate sparse blocks if they contain
+	  only zeroes.
+	- Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
+	  lock protection over the buffer submission for i/o which allows the
+	  removal of the get_bh()/put_bh() pairs for each buffer.
+	- Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
+	  where a concurrent truncate has truncated the runlist under our feet.
+	- Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
+	- In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
+	  in the first buffer head instead of a driver global spin lock to
+	  improve scalability.
+	- Minor fix to error handling and error message display in
+	  fs/ntfs/aops.c::ntfs_prepare_nonresident_write(). 
+
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
 
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index f083f27d8b6..894b2b876d3 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.23\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 78adad7a988..545236414d5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -55,9 +55,8 @@
  */
 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
-	static DEFINE_SPINLOCK(page_uptodate_lock);
 	unsigned long flags;
-	struct buffer_head *tmp;
+	struct buffer_head *first, *tmp;
 	struct page *page;
 	ntfs_inode *ni;
 	int page_uptodate = 1;
@@ -89,11 +88,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 	} else {
 		clear_buffer_uptodate(bh);
+		SetPageError(page);
 		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
 				(unsigned long long)bh->b_blocknr);
-		SetPageError(page);
 	}
-	spin_lock_irqsave(&page_uptodate_lock, flags);
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -108,7 +109,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -141,7 +143,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unlock_page(page);
 	return;
 still_busy:
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	return;
 }
 
@@ -185,13 +188,15 @@ static int ntfs_read_block(struct page *page)
 	blocksize_bits = VFS_I(ni)->i_blkbits;
 	blocksize = 1 << blocksize_bits;
 
-	if (!page_has_buffers(page))
+	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, blocksize, 0);
-	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		unlock_page(page);
-		return -ENOMEM;
+		if (unlikely(!page_has_buffers(page))) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
 	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 
 	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 	read_lock_irqsave(&ni->size_lock, flags);
@@ -204,6 +209,7 @@ static int ntfs_read_block(struct page *page)
 	nr = i = 0;
 	do {
 		u8 *kaddr;
+		int err;
 
 		if (unlikely(buffer_uptodate(bh)))
 			continue;
@@ -211,6 +217,7 @@ static int ntfs_read_block(struct page *page)
 			arr[nr++] = bh;
 			continue;
 		}
+		err = 0;
 		bh->b_bdev = vol->sb->s_bdev;
 		/* Is the block within the allowed limits? */
 		if (iblock < lblock) {
@@ -252,7 +259,6 @@ lock_retry_remap:
 				goto handle_hole;
 			/* If first try and runlist unmapped, map and retry. */
 			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				int err;
 				is_retry = TRUE;
 				/*
 				 * Attempt to map runlist, dropping lock for
@@ -263,20 +269,30 @@ lock_retry_remap:
 				if (likely(!err))
 					goto lock_retry_remap;
 				rl = NULL;
-				lcn = err;
 			} else if (!rl)
 				up_read(&ni->runlist.lock);
+			/*
+			 * If buffer is outside the runlist, treat it as a
+			 * hole.  This can happen due to concurrent truncate
+			 * for example.
+			 */
+			if (err == -ENOENT || lcn == LCN_ENOENT) {
+				err = 0;
+				goto handle_hole;
+			}
 			/* Hard error, zero out region. */
+			if (!err)
+				err = -EIO;
 			bh->b_blocknr = -1;
 			SetPageError(page);
 			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
 					"attribute type 0x%x, vcn 0x%llx, "
 					"offset 0x%x because its location on "
 					"disk could not be determined%s "
-					"(error code %lli).", ni->mft_no,
+					"(error code %i).", ni->mft_no,
 					ni->type, (unsigned long long)vcn,
 					vcn_ofs, is_retry ? " even after "
-					"retrying" : "", (long long)lcn);
+					"retrying" : "", err);
 		}
 		/*
 		 * Either iblock was outside lblock limits or
@@ -289,9 +305,10 @@ handle_hole:
 handle_zblock:
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr + i * blocksize, 0, blocksize);
-		flush_dcache_page(page);
 		kunmap_atomic(kaddr, KM_USER0);
-		set_buffer_uptodate(bh);
+		flush_dcache_page(page);
+		if (likely(!err))
+			set_buffer_uptodate(bh);
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
 
 	/* Release the lock if we took it. */
@@ -367,31 +384,38 @@ retry_readpage:
 		return 0;
 	}
 	ni = NTFS_I(page->mapping->host);
-
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.
+	 */
+	if (ni->type != AT_INDEX_ROOT) {
+		/* If attribute is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			err = -EACCES;
+			goto err_out;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			return ntfs_read_compressed_block(page);
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed or
-		 * encrypted.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				err = -EACCES;
-				goto err_out;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni))
-				return ntfs_read_compressed_block(page);
-		}
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_read_block(page);
 	}
 	/*
 	 * Attribute is resident, implying it is not compressed or encrypted.
 	 * This also means the attribute is smaller than an mft record and
 	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.
+	 * index above 0.  Note the attribute can actually be marked compressed
+	 * but if it is resident the actual data is not compressed so we are
+	 * ok to ignore the compressed flag here.
 	 */
 	if (unlikely(page->index > 0)) {
 		kaddr = kmap_atomic(page, KM_USER0);
@@ -511,19 +535,21 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 		BUG_ON(!PageUptodate(page));
 		create_empty_buffers(page, blocksize,
 				(1 << BH_Uptodate) | (1 << BH_Dirty));
+		if (unlikely(!page_has_buffers(page))) {
+			ntfs_warning(vol->sb, "Error allocating page "
+					"buffers.  Redirtying page so we try "
+					"again later.");
+			/*
+			 * Put the page back on mapping->dirty_pages, but leave
+			 * its buffers' dirty state as-is.
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
 	}
 	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		ntfs_warning(vol->sb, "Error allocating page buffers. "
-				"Redirtying page so we try again later.");
-		/*
-		 * Put the page back on mapping->dirty_pages, but leave its
-		 * buffer's dirty state as-is.
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
+	BUG_ON(!bh);
 
 	/* NOTE: Different naming scheme to ntfs_read_block()! */
 
@@ -670,6 +696,27 @@ lock_retry_remap:
 		}
 		/* It is a hole, need to instantiate it. */
 		if (lcn == LCN_HOLE) {
+			u8 *kaddr;
+			unsigned long *bpos, *bend;
+
+			/* Check if the buffer is zero. */
+			kaddr = kmap_atomic(page, KM_USER0);
+			bpos = (unsigned long *)(kaddr + bh_offset(bh));
+			bend = (unsigned long *)((u8*)bpos + blocksize);
+			do {
+				if (unlikely(*bpos))
+					break;
+			} while (likely(++bpos < bend));
+			kunmap_atomic(kaddr, KM_USER0);
+			if (bpos == bend) {
+				/*
+				 * Buffer is zero and sparse, no need to write
+				 * it.
+				 */
+				bh->b_blocknr = -1;
+				clear_buffer_dirty(bh);
+				continue;
+			}
 			// TODO: Instantiate the hole.
 			// clear_buffer_new(bh);
 			// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
@@ -690,20 +737,37 @@ lock_retry_remap:
 			if (likely(!err))
 				goto lock_retry_remap;
 			rl = NULL;
-			lcn = err;
 		} else if (!rl)
 			up_read(&ni->runlist.lock);
+		/*
+		 * If buffer is outside the runlist, truncate has cut it out
+		 * of the runlist.  Just clean and clear the buffer and set it
+		 * uptodate so it can get discarded by the VM.
+		 */
+		if (err == -ENOENT || lcn == LCN_ENOENT) {
+			u8 *kaddr;
+
+			bh->b_blocknr = -1;
+			clear_buffer_dirty(bh);
+			kaddr = kmap_atomic(page, KM_USER0);
+			memset(kaddr + bh_offset(bh), 0, blocksize);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page);
+			set_buffer_uptodate(bh);
+			err = 0;
+			continue;
+		}
 		/* Failed to map the buffer, even after retrying. */
+		if (!err)
+			err = -EIO;
 		bh->b_blocknr = -1;
 		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
 				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
 				"because its location on disk could not be "
-				"determined%s (error code %lli).", ni->mft_no,
+				"determined%s (error code %i).", ni->mft_no,
 				ni->type, (unsigned long long)vcn,
 				vcn_ofs, is_retry ? " even after "
-				"retrying" : "", (long long)lcn);
-		if (!err)
-			err = -EIO;
+				"retrying" : "", err);
 		break;
 	} while (block++, (bh = bh->b_this_page) != head);
 
@@ -714,7 +778,7 @@ lock_retry_remap:
 	/* For the error case, need to reset bh to the beginning. */
 	bh = head;
 
-	/* Just an optimization, so ->readpage() isn't called later. */
+	/* Just an optimization, so ->readpage() is not called later. */
 	if (unlikely(!PageUptodate(page))) {
 		int uptodate = 1;
 		do {
@@ -730,7 +794,6 @@ lock_retry_remap:
 
 	/* Setup all mapped, dirty buffers for async write i/o. */
 	do {
-		get_bh(bh);
 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
@@ -768,14 +831,8 @@ lock_retry_remap:
 
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
-	unlock_page(page);
 
-	/*
-	 * Submit the prepared buffers for i/o. Note the page is unlocked,
-	 * and the async write i/o completion handler can end_page_writeback()
-	 * at any time after the *first* submit_bh(). So the buffers can then
-	 * disappear...
-	 */
+	/* Submit the prepared buffers for i/o. */
 	need_end_writeback = TRUE;
 	do {
 		struct buffer_head *next = bh->b_this_page;
@@ -783,9 +840,9 @@ lock_retry_remap:
 			submit_bh(WRITE, bh);
 			need_end_writeback = FALSE;
 		}
-		put_bh(bh);
 		bh = next;
 	} while (bh != head);
+	unlock_page(page);
 
 	/* If no i/o was started, need to end_page_writeback(). */
 	if (unlikely(need_end_writeback))
@@ -860,7 +917,6 @@ static int ntfs_write_mst_block(struct page *page,
 	sync = (wbc->sync_mode == WB_SYNC_ALL);
 
 	/* Make sure we have mapped buffers. */
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 
@@ -1280,38 +1336,42 @@ retry_writepage:
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;
 	}
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.
+	 */
+	if (ni->type != AT_INDEX_ROOT) {
+		/* If file is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			unlock_page(page);
+			BUG_ON(ni->type != AT_DATA);
+			ntfs_debug("Denying write access to encrypted "
+					"file.");
+			return -EACCES;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			// TODO: Implement and replace this with
+			// return ntfs_write_compressed_block(page);
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to compressed files is "
+					"not supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+		// TODO: Implement and remove this check.
+		if (NInoNonResident(ni) && NInoSparse(ni)) {
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to sparse files is not "
+					"supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed, encrypted,
-		 * and/or sparse.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				unlock_page(page);
-				ntfs_debug("Denying write access to encrypted "
-						"file.");
-				return -EACCES;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement and replace this check with
-				// return ntfs_write_compressed_block(page);
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to compressed "
-						"files is not supported yet. "
-						"Sorry.");
-				return -EOPNOTSUPP;
-			}
-			// TODO: Implement and remove this check.
-			if (NInoSparse(ni)) {
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to sparse files "
-						"is not supported yet. Sorry.");
-				return -EOPNOTSUPP;
-			}
-		}
 		/* We have to zero every time due to mmap-at-end-of-file. */
 		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
 			/* The page straddles i_size. */
@@ -1324,14 +1384,16 @@ retry_writepage:
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
 			return ntfs_write_mst_block(page, wbc);
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_write_block(page, wbc);
 	}
 	/*
-	 * Attribute is resident, implying it is not compressed, encrypted,
-	 * sparse, or mst protected.  This also means the attribute is smaller
-	 * than an mft record and hence smaller than a page, so can simply
-	 * return error on any pages with index above 0.
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * mst protected.  This also means the attribute is smaller than an mft
+	 * record and hence smaller than a page, so can simply return error on
+	 * any pages with index above 0.  Note the attribute can actually be
+	 * marked compressed but if it is resident the actual data is not
+	 * compressed so we are ok to ignore the compressed flag here.
 	 */
 	BUG_ON(page_has_buffers(page));
 	BUG_ON(!PageUptodate(page));
@@ -1380,30 +1442,14 @@ retry_writepage:
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	unlock_page(page);
-
 	/*
-	 * Here, we don't need to zero the out of bounds area everytime because
-	 * the below memcpy() already takes care of the mmap-at-end-of-file
-	 * requirements. If the file is converted to a non-resident one, then
-	 * the code path use is switched to the non-resident one where the
-	 * zeroing happens on each ntfs_writepage() invocation.
-	 *
-	 * The above also applies nicely when i_size is decreased.
-	 *
-	 * When i_size is increased, the memory between the old and new i_size
-	 * _must_ be zeroed (or overwritten with new data). Otherwise we will
-	 * expose data to userspace/disk which should never have been exposed.
-	 *
-	 * FIXME: Ensure that i_size increases do the zeroing/overwriting and
-	 * if we cannot guarantee that, then enable the zeroing below.  If the
-	 * zeroing below is enabled, we MUST move the unlock_page() from above
-	 * to after the kunmap_atomic(), i.e. just before the
-	 * end_page_writeback().
-	 * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
-	 * increases for resident attributes so those are ok.
-	 * TODO: ntfs_truncate(), others?
+	 * Here, we do not need to zero the out of bounds area everytime
+	 * because the below memcpy() already takes care of the
+	 * mmap-at-end-of-file requirements.  If the file is converted to a
+	 * non-resident one, then the code path use is switched to the
+	 * non-resident one where the zeroing happens on each ntfs_writepage()
+	 * invocation.
 	 */
-
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
 	i_size = i_size_read(vi);
 	if (unlikely(attr_len > i_size)) {
@@ -1681,27 +1727,25 @@ lock_retry_remap:
 					if (likely(!err))
 						goto lock_retry_remap;
 					rl = NULL;
-					lcn = err;
 				} else if (!rl)
 					up_read(&ni->runlist.lock);
 				/*
 				 * Failed to map the buffer, even after
 				 * retrying.
 				 */
+				if (!err)
+					err = -EIO;
 				bh->b_blocknr = -1;
 				ntfs_error(vol->sb, "Failed to write to inode "
 						"0x%lx, attribute type 0x%x, "
 						"vcn 0x%llx, offset 0x%x "
 						"because its location on disk "
 						"could not be determined%s "
-						"(error code %lli).",
+						"(error code %i).",
 						ni->mft_no, ni->type,
 						(unsigned long long)vcn,
 						vcn_ofs, is_retry ? " even "
-						"after retrying" : "",
-						(long long)lcn);
-				if (!err)
-					err = -EIO;
+						"after retrying" : "", err);
 				goto err_out;
 			}
 			/* We now have a successful remap, i.e. lcn >= 0. */
@@ -2357,6 +2401,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 			buffers_to_free = bh;
 	}
 	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 	do {
 		bh_ofs = bh_offset(bh);
 		if (bh_ofs + bh_size <= ofs)
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index cd0f9e740b1..3f9a4ff42ee 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -43,6 +43,9 @@
  * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
  * of bounds of the runlist.
  *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
  * Locking: - The runlist must be locked for writing.
  *	    - This function modifies the runlist.
  */
@@ -54,6 +57,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 	ATTR_RECORD *a;
 	ntfs_attr_search_ctx *ctx;
 	runlist_element *rl;
+	unsigned long flags;
 	int err = 0;
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
@@ -85,8 +89,11 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 	 * ntfs_mapping_pairs_decompress() fails.
 	 */
 	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1))
+	if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) {
+		read_lock_irqsave(&ni->size_lock, flags);
 		end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 	if (unlikely(vcn >= end_vcn)) {
 		err = -ENOENT;
 		goto err_out;
@@ -165,6 +172,7 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked)
 {
 	LCN lcn;
+	unsigned long flags;
 	BOOL is_retry = FALSE;
 
 	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
@@ -173,6 +181,14 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
 	BUG_ON(!ni);
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return LCN_ENOENT;
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 retry_remap:
 	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
 	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
@@ -255,6 +271,7 @@ retry_remap:
 runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked)
 {
+	unsigned long flags;
 	runlist_element *rl;
 	int err = 0;
 	BOOL is_retry = FALSE;
@@ -265,6 +282,14 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 	BUG_ON(!ni);
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return ERR_PTR(-ENOENT);
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
 retry_remap:
 	rl = ni->runlist.rl;
 	if (likely(rl && vcn >= rl[0].vcn)) {
@@ -528,6 +553,11 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
 	block_size_bits = sb->s_blocksize_bits;
 	down_read(&runlist->lock);
 	rl = runlist->rl;
+	if (!rl) {
+		ntfs_error(sb, "Cannot read attribute list since runlist is "
+				"missing.");
+		goto err_out;	
+	}
 	/* Read all clusters specified by the runlist one run at a time. */
 	while (rl->length) {
 		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
@@ -1247,6 +1277,46 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
 }
 
 /**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m:		mft record containing attribute record
+ * @a:		attribute record whose value to resize
+ * @new_size:	new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly allocated space is cleared.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ *	    are interested in the data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size)
+{
+	u32 old_size;
+
+	/* Resize the resident part of the attribute record. */
+	if (ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) + new_size))
+		return -ENOSPC;
+	/*
+	 * The resize succeeded!  If we made the attribute value bigger, clear
+	 * the area between the old size and @new_size.
+	 */
+	old_size = le32_to_cpu(a->data.resident.value_length);
+	if (new_size > old_size)
+		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+				old_size, 0, new_size - old_size);
+	/* Finally update the length of the attribute value. */
+	a->data.resident.value_length = cpu_to_le32(new_size);
+	return 0;
+}
+
+/**
  * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
  * @ni:		ntfs inode describing the attribute to convert
  *
@@ -1302,6 +1372,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		return err;
 	}
 	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
+	/*
 	 * The size needs to be aligned to a cluster boundary for allocation
 	 * purposes.
 	 */
@@ -1377,10 +1453,15 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 	BUG_ON(a->non_resident);
 	/*
 	 * Calculate new offsets for the name and the mapping pairs array.
-	 * We assume the attribute is not compressed or sparse.
 	 */
-	name_ofs = (offsetof(ATTR_REC,
-			data.non_resident.compressed_size) + 7) & ~7;
+	if (NInoSparse(ni) || NInoCompressed(ni))
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) +
+				sizeof(a->data.non_resident.compressed_size) +
+				7) & ~7;
+	else
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) + 7) & ~7;
 	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
 	/*
 	 * Determine the size of the resident part of the now non-resident
@@ -1419,24 +1500,23 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
 				a->name_length * sizeof(ntfschar));
 	a->name_offset = cpu_to_le16(name_ofs);
-	/*
-	 * FIXME: For now just clear all of these as we do not support them
-	 * when writing.
-	 */
-	a->flags &= cpu_to_le16(0xffff & ~le16_to_cpu(ATTR_IS_SPARSE |
-			ATTR_IS_ENCRYPTED | ATTR_COMPRESSION_MASK));
 	/* Setup the fields specific to non-resident attributes. */
 	a->data.non_resident.lowest_vcn = 0;
 	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
 			vol->cluster_size_bits);
 	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
-	a->data.non_resident.compression_unit = 0;
 	memset(&a->data.non_resident.reserved, 0,
 			sizeof(a->data.non_resident.reserved));
 	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
 	a->data.non_resident.data_size =
 			a->data.non_resident.initialized_size =
 			cpu_to_sle64(attr_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		a->data.non_resident.compression_unit = 4;
+		a->data.non_resident.compressed_size =
+				a->data.non_resident.allocated_size;
+	} else
+		a->data.non_resident.compression_unit = 0;
 	/* Generate the mapping pairs array into the attribute record. */
 	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
 			arec_size - mp_ofs, rl, 0, -1, NULL);
@@ -1446,16 +1526,19 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		goto undo_err_out;
 	}
 	/* Setup the in-memory attribute structure to be non-resident. */
-	/*
-	 * FIXME: For now just clear all of these as we do not support them
-	 * when writing.
-	 */
-	NInoClearSparse(ni);
-	NInoClearEncrypted(ni);
-	NInoClearCompressed(ni);
 	ni->runlist.rl = rl;
 	write_lock_irqsave(&ni->size_lock, flags);
 	ni->allocated_size = new_size;
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size = ni->allocated_size;
+		ni->itype.compressed.block_size = 1U <<
+				(a->data.non_resident.compression_unit +
+				vol->cluster_size_bits);
+		ni->itype.compressed.block_size_bits =
+				ffs(ni->itype.compressed.block_size) - 1;
+		ni->itype.compressed.block_clusters = 1U <<
+				a->data.non_resident.compression_unit;
+	}
 	write_unlock_irqrestore(&ni->size_lock, flags);
 	/*
 	 * This needs to be last since the address space operations ->readpage
@@ -1603,6 +1686,12 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	BUG_ON(cnt < 0);
 	if (!cnt)
 		goto done;
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
 	mapping = VFS_I(ni)->i_mapping;
 	/* Work out the starting index and page offset. */
 	idx = ofs >> PAGE_CACHE_SHIFT;
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0e4ac6d3c0e..0618ed6fd7b 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -99,6 +99,8 @@ extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
 		const ATTR_TYPE type);
 
 extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size);
 
 extern int ntfs_attr_make_non_resident(ntfs_inode *ni);
 
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6d265cfd49a..25d24106f89 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -539,7 +539,6 @@ int ntfs_read_compressed_block(struct page *page)
 	if (unlikely(!pages || !bhs)) {
 		kfree(bhs);
 		kfree(pages);
-		SetPageError(page);
 		unlock_page(page);
 		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
 		return -ENOMEM;
@@ -871,9 +870,6 @@ lock_retry_remap:
 			for (; prev_cur_page < cur_page; prev_cur_page++) {
 				page = pages[prev_cur_page];
 				if (page) {
-					if (prev_cur_page == xpage &&
-							!xpage_done)
-						SetPageError(page);
 					flush_dcache_page(page);
 					kunmap(page);
 					unlock_page(page);
@@ -904,8 +900,6 @@ lock_retry_remap:
 					"Terminating them with extreme "
 					"prejudice.  Inode 0x%lx, page index "
 					"0x%lx.", ni->mft_no, page->index);
-			if (cur_page == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
@@ -953,8 +947,6 @@ err_out:
 	for (i = cur_page; i < max_page; i++) {
 		page = pages[i];
 		if (page) {
-			if (i == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 46779471c54..795c3d1930f 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1051,7 +1051,8 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
 			ie->key.file_name.file_name_length, &name,
 			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
 	if (name_len <= 0) {
-		ntfs_debug("Skipping unrepresentable file.");
+		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+				(long long)MREF_LE(ie->data.dir.indexed_file));
 		return 0;
 	}
 	if (ie->key.file_name.file_attributes &
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index e0f530ce6b9..be9fd1dd423 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -94,6 +94,11 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
 		ret = ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
+	/*
+	 * NOTE: If we were to use mapping->private_list (see ext2 and
+	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
+	 * sync_mapping_buffers(vi->i_mapping).
+	 */
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
 		ret = err;
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 11fd5307d78..8f2d5727546 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -205,6 +205,7 @@ int ntfs_index_lookup(const void *key, const int key_len,
 				&ie->key, key_len)) {
 ir_done:
 			ictx->is_in_root = TRUE;
+			ictx->ir = ir;
 			ictx->actx = actx;
 			ictx->base_ni = base_ni;
 			ictx->ia = NULL;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 886214a77f9..dc4bbe3acf5 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1013,41 +1013,50 @@ skip_large_dir_stuff:
 		}
 		a = ctx->attr;
 		/* Setup the state. */
-		if (a->non_resident) {
-			NInoSetNonResident(ni);
-			if (a->flags & (ATTR_COMPRESSION_MASK |
-					ATTR_IS_SPARSE)) {
-				if (a->flags & ATTR_COMPRESSION_MASK) {
-					NInoSetCompressed(ni);
-					if (vol->cluster_size > 4096) {
-						ntfs_error(vi->i_sb, "Found "
+		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
+				NInoSetCompressed(ni);
+				if (vol->cluster_size > 4096) {
+					ntfs_error(vi->i_sb, "Found "
 							"compressed data but "
 							"compression is "
 							"disabled due to "
 							"cluster size (%i) > "
 							"4kiB.",
 							vol->cluster_size);
-						goto unm_err_out;
-					}
-					if ((a->flags & ATTR_COMPRESSION_MASK)
-							!= ATTR_IS_COMPRESSED) {
-						ntfs_error(vi->i_sb, "Found "
-							"unknown compression "
-							"method or corrupt "
-							"file.");
-						goto unm_err_out;
-					}
+					goto unm_err_out;
+				}
+				if ((a->flags & ATTR_COMPRESSION_MASK)
+						!= ATTR_IS_COMPRESSED) {
+					ntfs_error(vi->i_sb, "Found unknown "
+							"compression method "
+							"or corrupt file.");
+					goto unm_err_out;
 				}
-				if (a->flags & ATTR_IS_SPARSE)
-					NInoSetSparse(ni);
+			}
+			if (a->flags & ATTR_IS_SPARSE)
+				NInoSetSparse(ni);
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (NInoCompressed(ni)) {
+				ntfs_error(vi->i_sb, "Found encrypted and "
+						"compressed data.");
+				goto unm_err_out;
+			}
+			NInoSetEncrypted(ni);
+		}
+		if (a->non_resident) {
+			NInoSetNonResident(ni);
+			if (NInoCompressed(ni) || NInoSparse(ni)) {
 				if (a->data.non_resident.compression_unit !=
 						4) {
 					ntfs_error(vi->i_sb, "Found "
-						"nonstandard compression unit "
-						"(%u instead of 4).  Cannot "
-						"handle this.",
-						a->data.non_resident.
-						compression_unit);
+							"nonstandard "
+							"compression unit (%u "
+							"instead of 4).  "
+							"Cannot handle this.",
+							a->data.non_resident.
+							compression_unit);
 					err = -EOPNOTSUPP;
 					goto unm_err_out;
 				}
@@ -1065,14 +1074,6 @@ skip_large_dir_stuff:
 						a->data.non_resident.
 						compressed_size);
 			}
-			if (a->flags & ATTR_IS_ENCRYPTED) {
-				if (a->flags & ATTR_COMPRESSION_MASK) {
-					ntfs_error(vi->i_sb, "Found encrypted "
-							"and compressed data.");
-					goto unm_err_out;
-				}
-				NInoSetEncrypted(ni);
-			}
 			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(vi->i_sb, "First extent of $DATA "
 						"attribute has non zero "
@@ -1212,6 +1213,75 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	if (unlikely(err))
 		goto unm_err_out;
 	a = ctx->attr;
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			NInoSetCompressed(ni);
+			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+					ni->name_len)) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"non-data or named data "
+						"attribute.  Please report "
+						"you saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				goto unm_err_out;
+			}
+			if (vol->cluster_size > 4096) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"attribute but compression is "
+						"disabled due to cluster size "
+						"(%i) > 4kiB.",
+						vol->cluster_size);
+				goto unm_err_out;
+			}
+			if ((a->flags & ATTR_COMPRESSION_MASK) !=
+					ATTR_IS_COMPRESSED) {
+				ntfs_error(vi->i_sb, "Found unknown "
+						"compression method.");
+				goto unm_err_out;
+			}
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * compress all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is %s.  Please "
+					"report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net",
+					NInoCompressed(ni) ? "compressed" :
+					"sparse");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_SPARSE)
+			NInoSetSparse(ni);
+	}
+	if (a->flags & ATTR_IS_ENCRYPTED) {
+		if (NInoCompressed(ni)) {
+			ntfs_error(vi->i_sb, "Found encrypted and compressed "
+					"data.");
+			goto unm_err_out;
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * encrypt all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is encrypted.  "
+					"Please report you saw this message "
+					"to linux-ntfs-dev@lists.sourceforge."
+					"net");
+			goto unm_err_out;
+		}
+		if (ni->type != AT_DATA) {
+			ntfs_error(vi->i_sb, "Found encrypted non-data "
+					"attribute.");
+			goto unm_err_out;
+		}
+		NInoSetEncrypted(ni);
+	}
 	if (!a->non_resident) {
 		/* Ensure the attribute name is placed before the value. */
 		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
@@ -1220,11 +1290,10 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 					"the attribute value.");
 			goto unm_err_out;
 		}
-		if (NInoMstProtected(ni) || a->flags) {
+		if (NInoMstProtected(ni)) {
 			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"or attribute with non-zero flags but "
-					"the attribute is resident.  Please "
-					"report you saw this message to "
+					"but the attribute is resident.  "
+					"Please report you saw this message to "
 					"linux-ntfs-dev@lists.sourceforge.net");
 			goto unm_err_out;
 		}
@@ -1250,50 +1319,8 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 					"the mapping pairs array.");
 			goto unm_err_out;
 		}
-		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				NInoSetCompressed(ni);
-				if ((ni->type != AT_DATA) || (ni->type ==
-						AT_DATA && ni->name_len)) {
-					ntfs_error(vi->i_sb, "Found compressed "
-							"non-data or named "
-							"data attribute.  "
-							"Please report you "
-							"saw this message to "
-							"linux-ntfs-dev@lists."
-							"sourceforge.net");
-					goto unm_err_out;
-				}
-				if (vol->cluster_size > 4096) {
-					ntfs_error(vi->i_sb, "Found compressed "
-							"attribute but "
-							"compression is "
-							"disabled due to "
-							"cluster size (%i) > "
-							"4kiB.",
-							vol->cluster_size);
-					goto unm_err_out;
-				}
-				if ((a->flags & ATTR_COMPRESSION_MASK) !=
-						ATTR_IS_COMPRESSED) {
-					ntfs_error(vi->i_sb, "Found unknown "
-							"compression method.");
-					goto unm_err_out;
-				}
-			}
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is %s.  Please report you "
-						"saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net",
-						NInoCompressed(ni) ?
-						"compressed" : "sparse");
-				goto unm_err_out;
-			}
-			if (a->flags & ATTR_IS_SPARSE)
-				NInoSetSparse(ni);
+		if ((NInoCompressed(ni) || NInoSparse(ni)) &&
+				ni->type != AT_INDEX_ROOT) {
 			if (a->data.non_resident.compression_unit != 4) {
 				ntfs_error(vi->i_sb, "Found nonstandard "
 						"compression unit (%u instead "
@@ -1313,23 +1340,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 			ni->itype.compressed.size = sle64_to_cpu(
 					a->data.non_resident.compressed_size);
 		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed data.");
-				goto unm_err_out;
-			}
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is encrypted.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
 		if (a->data.non_resident.lowest_vcn) {
 			ntfs_error(vi->i_sb, "First extent of attribute has "
 					"non-zero lowest_vcn.");
@@ -1348,12 +1358,12 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
 		vi->i_mapping->a_ops = &ntfs_aops;
-	if (NInoCompressed(ni) || NInoSparse(ni))
+	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
 		vi->i_blocks = ni->itype.compressed.size >> 9;
 	else
 		vi->i_blocks = ni->allocated_size >> 9;
 	/*
-	 * Make sure the base inode doesn't go away and attach it to the
+	 * Make sure the base inode does not go away and attach it to the
 	 * attribute inode.
 	 */
 	igrab(base_vi);
@@ -1480,7 +1490,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 				"after the attribute value.");
 		goto unm_err_out;
 	}
-	/* Compressed/encrypted/sparse index root is not allowed. */
+	/*
+	 * Compressed/encrypted/sparse index root is not allowed, except for
+	 * directories of course but those are not dealt with here.
+	 */
 	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
 			ATTR_IS_SPARSE)) {
 		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
@@ -2430,16 +2443,18 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 			 * We skipped the truncate but must still update
 			 * timestamps.
 			 */
-			ia_valid |= ATTR_MTIME|ATTR_CTIME;
+			ia_valid |= ATTR_MTIME | ATTR_CTIME;
 		}
 	}
-
 	if (ia_valid & ATTR_ATIME)
-		vi->i_atime = attr->ia_atime;
+		vi->i_atime = timespec_trunc(attr->ia_atime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_MTIME)
-		vi->i_mtime = attr->ia_mtime;
+		vi->i_mtime = timespec_trunc(attr->ia_mtime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_CTIME)
-		vi->i_ctime = attr->ia_ctime;
+		vi->i_ctime = timespec_trunc(attr->ia_ctime,
+				vi->i_sb->s_time_gran);
 	mark_inode_dirty(vi);
 out:
 	return err;
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index a4bc07616e5..7b593429068 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -54,6 +54,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
 	int ret = 0;
 
 	ntfs_debug("Entering.");
+	if (!rl)
+		return 0;
 	for (; rl->length; rl++) {
 		int err;
 
@@ -163,17 +165,9 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 	BUG_ON(zone < FIRST_ZONE);
 	BUG_ON(zone > LAST_ZONE);
 
-	/* Return empty runlist if @count == 0 */
-	// FIXME: Do we want to just return NULL instead? (AIA)
-	if (!count) {
-		rl = ntfs_malloc_nofs(PAGE_SIZE);
-		if (!rl)
-			return ERR_PTR(-ENOMEM);
-		rl[0].vcn = start_vcn;
-		rl[0].lcn = LCN_RL_NOT_MAPPED;
-		rl[0].length = 0;
-		return rl;
-	}
+	/* Return NULL if @count is zero. */
+	if (!count)
+		return NULL;
 	/* Take the lcnbmp lock for writing. */
 	down_write(&vol->lcnbmp_lock);
 	/*
@@ -788,7 +782,8 @@ out:
  * @vi:		vfs inode whose runlist describes the clusters to free
  * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
- * @is_rollback:	if TRUE this is a rollback operation
+ * @write_locked:	true if the runlist is locked for writing
+ * @is_rollback:	true if this is a rollback operation
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
  * described by the vfs inode @vi.
@@ -806,17 +801,17 @@ out:
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * Locking: - The runlist described by @vi must be locked on entry and is
+ *	      locked on return.  Note if the runlist is locked for reading the
+ *	      lock may be dropped and reacquired.  Note the runlist may be
+ *	      modified when needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
 s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
-		const BOOL is_rollback)
+		const BOOL write_locked, const BOOL is_rollback)
 {
 	s64 delta, to_free, total_freed, real_freed;
 	ntfs_inode *ni;
@@ -848,8 +843,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 	total_freed = real_freed = 0;
 
-	down_read(&ni->runlist.lock);
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, FALSE);
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, write_locked);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -903,7 +897,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			rl = ntfs_attr_find_vcn_nolock(ni, vcn, FALSE);
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, write_locked);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
@@ -950,7 +944,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 		/* Update the total done clusters. */
 		total_freed += to_free;
 	}
-	up_read(&ni->runlist.lock);
 	if (likely(!is_rollback))
 		up_write(&vol->lcnbmp_lock);
 
@@ -960,7 +953,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 	ntfs_debug("Done.");
 	return real_freed;
 err_out:
-	up_read(&ni->runlist.lock);
 	if (is_rollback)
 		return err;
 	/* If no real clusters were freed, no need to rollback. */
@@ -973,7 +965,8 @@ err_out:
 	 * If rollback fails, set the volume errors flag, emit an error
 	 * message, and return the error code.
 	 */
-	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
+	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, write_locked,
+			TRUE);
 	if (delta < 0) {
 		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
 				"inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 4cac1c024af..e4d7fb98d68 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -43,13 +43,14 @@ extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
 		const NTFS_CLUSTER_ALLOCATION_ZONES zone);
 
 extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count, const BOOL is_rollback);
+		s64 count, const BOOL write_locked, const BOOL is_rollback);
 
 /**
  * ntfs_cluster_free - free clusters on an ntfs volume
  * @vi:		vfs inode whose runlist describes the clusters to free
  * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
+ * @write_locked:	true if the runlist is locked for writing
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
  * described by the vfs inode @vi.
@@ -64,19 +65,19 @@ extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * Locking: - The runlist described by @vi must be locked on entry and is
+ *	      locked on return.  Note if the runlist is locked for reading the
+ *	      lock may be dropped and reacquired.  Note the runlist may be
+ *	      modified when needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
 static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count)
+		s64 count, const BOOL write_locked)
 {
-	return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
+	return __ntfs_cluster_free(vi, start_vcn, count, write_locked, FALSE);
 }
 
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
@@ -93,8 +94,10 @@ extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  *
  * Return 0 on success and -errno on error.
  *
- * Locking: This function takes the volume lcn bitmap lock for writing and
- *	    modifies the bitmap contents.
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - The caller must have locked the runlist @rl for reading or
+ *	      writing.
  */
 static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
 		const runlist_element *rl)
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 8edb8e20fb0..0173e95500d 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -121,7 +121,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	 */
 	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-				"chkdsk but a chkdsk LSN is specified.");
+				"by chkdsk but a chkdsk LSN is specified.");
 		return FALSE;
 	}
 	ntfs_debug("Done.");
@@ -312,10 +312,12 @@ err_out:
  * @vi:		$LogFile inode to which the restart page belongs
  * @rp:		restart page to check
  * @pos:	position in @vi at which the restart page resides
- * @wrp:	copy of the multi sector transfer deprotected restart page
+ * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn:	[OUT] set to the current logfile lsn on success
  *
- * Check the restart page @rp for consistency and return TRUE if it is
- * consistent and FALSE otherwise.
+ * Check the restart page @rp for consistency and return 0 if it is consistent
+ * and -errno otherwise.  The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
  *
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
@@ -323,25 +325,33 @@ err_out:
  * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
  * copy of the complete multi sector transfer deprotected page.  On failure,
  * *@wrp is undefined.
+ *
+ * Simillarly, if @lsn is not NULL, on succes *@lsn will be set to the current
+ * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ *	-EINVAL	- The restart page is inconsistent.
+ *	-ENOMEM	- Not enough memory to load the restart page.
+ *	-EIO	- Failed to reading from $LogFile.
  */
-static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
+static int ntfs_check_and_load_restart_page(struct inode *vi,
+		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+		LSN *lsn)
 {
 	RESTART_AREA *ra;
 	RESTART_PAGE_HEADER *trp;
-	int size;
-	BOOL ret;
+	int size, err;
 
 	ntfs_debug("Entering.");
 	/* Check the restart page header for consistency. */
 	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	/* Check the restart area for consistency. */
 	if (!ntfs_check_restart_area(vi, rp)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
@@ -352,7 +362,7 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 	if (!trp) {
 		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
 				"restart page buffer.");
-		return FALSE;
+		return -ENOMEM;
 	}
 	/*
 	 * Read the whole of the restart page into the buffer.  If it fits
@@ -379,6 +389,9 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 			if (IS_ERR(page)) {
 				ntfs_error(vi->i_sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
+				err = PTR_ERR(page);
+				if (err != -EIO && err != -ENOMEM)
+					err = -EIO;
 				goto err_out;
 			}
 			size = min_t(int, to_read, PAGE_CACHE_SIZE);
@@ -392,29 +405,57 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 	/* Perform the multi sector transfer deprotection on the buffer. */
 	if (post_read_mst_fixup((NTFS_RECORD*)trp,
 			le32_to_cpu(rp->system_page_size))) {
-		ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
-				"$LogFile restart page.");
-		goto err_out;
+		/*
+		 * A multi sector tranfer error was detected.  We only need to
+		 * abort if the restart page contents exceed the multi sector
+		 * transfer fixup of the first sector.
+		 */
+		if (le16_to_cpu(rp->restart_area_offset) +
+				le16_to_cpu(ra->restart_area_length) >
+				NTFS_BLOCK_SIZE - sizeof(u16)) {
+			ntfs_error(vi->i_sb, "Multi sector transfer error "
+					"detected in $LogFile restart page.");
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	/*
+	 * If the restart page is modified by chkdsk or there are no active
+	 * logfile clients, the logfile is consistent.  Otherwise, need to
+	 * check the log client records for consistency, too.
+	 */
+	err = 0;
+	if (ntfs_is_rstr_record(rp->magic) &&
+			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+		if (!ntfs_check_log_client_array(vi, trp)) {
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	if (lsn) {
+		if (ntfs_is_rstr_record(rp->magic))
+			*lsn = sle64_to_cpu(ra->current_lsn);
+		else /* if (ntfs_is_chkd_record(rp->magic)) */
+			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
 	}
-	/* Check the log client records for consistency. */
-	ret = ntfs_check_log_client_array(vi, trp);
-	if (ret && wrp)
-		*wrp = trp;
-	else
-		ntfs_free(trp);
 	ntfs_debug("Done.");
-	return ret;
+	if (wrp)
+		*wrp = trp;
+	else {
 err_out:
-	ntfs_free(trp);
-	return FALSE;
+		ntfs_free(trp);
+	}
+	return err;
 }
 
 /**
  * ntfs_check_logfile - check the journal for consistency
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		[OUT] on success this is a copy of the current restart page
  *
  * Check the $LogFile journal for consistency and return TRUE if it is
- * consistent and FALSE if not.
+ * consistent and FALSE if not.  On success, the current restart page is
+ * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
  *
  * At present we only check the two restart pages and ignore the log record
  * pages.
@@ -424,19 +465,18 @@ err_out:
  * if the $LogFile was created on a system with a different page size to ours
  * yet and mst deprotection would fail if our page size is smaller.
  */
-BOOL ntfs_check_logfile(struct inode *log_vi)
+BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 {
-	s64 size, pos, rstr1_pos, rstr2_pos;
+	s64 size, pos;
+	LSN rstr1_lsn, rstr2_lsn;
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
 	struct address_space *mapping = log_vi->i_mapping;
 	struct page *page = NULL;
 	u8 *kaddr = NULL;
 	RESTART_PAGE_HEADER *rstr1_ph = NULL;
 	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, log_page_mask, ofs;
+	int log_page_size, log_page_mask, err;
 	BOOL logfile_is_empty = TRUE;
-	BOOL rstr1_found = FALSE;
-	BOOL rstr2_found = FALSE;
 	u8 log_page_bits;
 
 	ntfs_debug("Entering.");
@@ -491,7 +531,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
-				return FALSE;
+				goto err_out;
 			}
 		}
 		kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
@@ -510,99 +550,95 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 		 */
 		if (ntfs_is_rcrd_recordp((le32*)kaddr))
 			break;
-		/*
-		 * A modified by chkdsk restart page means we cannot handle
-		 * this log file.
-		 */
-		if (ntfs_is_chkd_recordp((le32*)kaddr)) {
-			ntfs_error(vol->sb, "$LogFile has been modified by "
-					"chkdsk.  Mount this volume in "
-					"Windows.");
-			goto err_out;
-		}
-		/* If not a restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
-			/* Skip to the minimum page size for the next one. */
+		/* If not a (modified by chkdsk) restart page, continue. */
+		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
+				!ntfs_is_chkd_recordp((le32*)kaddr)) {
 			if (!pos)
 				pos = NTFS_BLOCK_SIZE >> 1;
 			continue;
 		}
-		/* We now know we have a restart page. */
-		if (!pos) {
-			rstr1_found = TRUE;
-			rstr1_pos = pos;
-		} else {
-			if (rstr2_found) {
-				ntfs_error(vol->sb, "Found more than two "
-						"restart pages in $LogFile.");
-				goto err_out;
-			}
-			rstr2_found = TRUE;
-			rstr2_pos = pos;
-		}
 		/*
-		 * Check the restart page for consistency and get a copy of the
-		 * complete multi sector transfer deprotected restart page.
+		 * Check the (modified by chkdsk) restart page for consistency
+		 * and get a copy of the complete multi sector transfer
+		 * deprotected restart page.
 		 */
-		if (!ntfs_check_and_load_restart_page(log_vi,
+		err = ntfs_check_and_load_restart_page(log_vi,
 				(RESTART_PAGE_HEADER*)kaddr, pos,
-				!pos ? &rstr1_ph : &rstr2_ph)) {
-			/* Error output already done inside the function. */
-			goto err_out;
+				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
+				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
+		if (!err) {
+			/*
+			 * If we have now found the first (modified by chkdsk)
+			 * restart page, continue looking for the second one.
+			 */
+			if (!pos) {
+				pos = NTFS_BLOCK_SIZE >> 1;
+				continue;
+			}
+			/*
+			 * We have now found the second (modified by chkdsk)
+			 * restart page, so we can stop looking.
+			 */
+			break;
 		}
 		/*
-		 * We have a valid restart page.  The next one must be after
-		 * a whole system page size as specified by the valid restart
-		 * page.
+		 * Error output already done inside the function.  Note, we do
+		 * not abort if the restart page was invalid as we might still
+		 * find a valid one further in the file.
 		 */
+		if (err != -EINVAL) {
+			ntfs_unmap_page(page);
+			goto err_out;
+		}
+		/* Continue looking. */
 		if (!pos)
-			pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
+			pos = NTFS_BLOCK_SIZE >> 1;
 	}
-	if (page) {
+	if (page)
 		ntfs_unmap_page(page);
-		page = NULL;
-	}
 	if (logfile_is_empty) {
 		NVolSetLogFileEmpty(vol);
 is_empty:
 		ntfs_debug("Done.  ($LogFile is empty.)");
 		return TRUE;
 	}
-	if (!rstr1_found || !rstr2_found) {
-		ntfs_error(vol->sb, "Did not find two restart pages in "
-				"$LogFile.");
-		goto err_out;
+	if (!rstr1_ph) {
+		BUG_ON(rstr2_ph);
+		ntfs_error(vol->sb, "Did not find any restart pages in "
+				"$LogFile and it was not empty.");
+		return FALSE;
+	}
+	/* If both restart pages were found, use the more recent one. */
+	if (rstr2_ph) {
+		/*
+		 * If the second restart area is more recent, switch to it.
+		 * Otherwise just throw it away.
+		 */
+		if (rstr2_lsn > rstr1_lsn) {
+			ntfs_free(rstr1_ph);
+			rstr1_ph = rstr2_ph;
+			/* rstr1_lsn = rstr2_lsn; */
+		} else
+			ntfs_free(rstr2_ph);
+		rstr2_ph = NULL;
 	}
-	/*
-	 * The two restart areas must be identical except for the update
-	 * sequence number.
-	 */
-	ofs = le16_to_cpu(rstr1_ph->usa_ofs);
-	if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
-			memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
-			le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
-		ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
-				"match.");
-		goto err_out;
-	}
-	ntfs_free(rstr1_ph);
-	ntfs_free(rstr2_ph);
 	/* All consistency checks passed. */
+	if (rp)
+		*rp = rstr1_ph;
+	else
+		ntfs_free(rstr1_ph);
 	ntfs_debug("Done.");
 	return TRUE;
 err_out:
-	if (page)
-		ntfs_unmap_page(page);
 	if (rstr1_ph)
 		ntfs_free(rstr1_ph);
-	if (rstr2_ph)
-		ntfs_free(rstr2_ph);
 	return FALSE;
 }
 
 /**
  * ntfs_is_logfile_clean - check in the journal if the volume is clean
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		copy of the current restart page
  *
  * Analyze the $LogFile journal and return TRUE if it indicates the volume was
  * shutdown cleanly and FALSE if not.
@@ -619,11 +655,9 @@ err_out:
  * is empty this function requires that NVolLogFileEmpty() is true otherwise an
  * empty volume will be reported as dirty.
  */
-BOOL ntfs_is_logfile_clean(struct inode *log_vi)
+BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 {
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct page *page;
-	RESTART_PAGE_HEADER *rp;
 	RESTART_AREA *ra;
 
 	ntfs_debug("Entering.");
@@ -632,24 +666,15 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
 		ntfs_debug("Done.  ($LogFile is empty.)");
 		return TRUE;
 	}
-	/*
-	 * Read the first restart page.  It will be possibly incomplete and
-	 * will not be multi sector transfer deprotected but we only need the
-	 * first NTFS_BLOCK_SIZE bytes so it does not matter.
-	 */
-	page = ntfs_map_page(log_vi->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
+	BUG_ON(!rp);
+	if (!ntfs_is_rstr_record(rp->magic) &&
+			!ntfs_is_chkd_record(rp->magic)) {
+		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
+				"probably a bug in that the $LogFile should "
+				"have been consistency checked before calling "
+				"this function.");
 		return FALSE;
 	}
-	rp = (RESTART_PAGE_HEADER*)page_address(page);
-	if (!ntfs_is_rstr_record(rp->magic)) {
-		ntfs_error(vol->sb, "No restart page found at offset zero in "
-				"$LogFile.  This is probably a bug in that "
-				"the $LogFile should have been consistency "
-				"checked before calling this function.");
-		goto err_out;
-	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
 	 * If the $LogFile has active clients, i.e. it is open, and we do not
@@ -659,15 +684,11 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
 	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
 			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
 		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		goto err_out;
+		return FALSE;
 	}
-	ntfs_unmap_page(page);
 	/* $LogFile indicates a clean shutdown. */
 	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
 	return TRUE;
-err_out:
-	ntfs_unmap_page(page);
-	return FALSE;
 }
 
 /**
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 4ee4378de06..42388f95ea6 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -2,7 +2,7 @@
  * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
  *	       the Linux-NTFS project.
  *
- * Copyright (c) 2000-2004 Anton Altaparmakov
+ * Copyright (c) 2000-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -296,9 +296,11 @@ typedef struct {
 /* sizeof() = 160 (0xa0) bytes */
 } __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
 
-extern BOOL ntfs_check_logfile(struct inode *log_vi);
+extern BOOL ntfs_check_logfile(struct inode *log_vi,
+		RESTART_PAGE_HEADER **rp);
 
-extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
+extern BOOL ntfs_is_logfile_clean(struct inode *log_vi,
+		const RESTART_PAGE_HEADER *rp);
 
 extern BOOL ntfs_empty_logfile(struct inode *log_vi);
 
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index fac5944df6d..9994e019a3c 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -27,27 +27,63 @@
 #include <linux/highmem.h>
 
 /**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size	number of bytes to allocate
+ * __ntfs_malloc - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ * @gfp_mask:	extra flags for the allocator
+ *
+ * Internal function.  You probably want ntfs_malloc_nofs()...
  *
  * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
  * returns a pointer to the allocated memory.
  *
  * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
  */
-static inline void *ntfs_malloc_nofs(unsigned long size)
+static inline void *__ntfs_malloc(unsigned long size,
+		unsigned int __nocast gfp_mask)
 {
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
 		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, GFP_NOFS);
-		/* return (void *)__get_free_page(GFP_NOFS | __GFP_HIGHMEM); */
+		return kmalloc(PAGE_SIZE, gfp_mask);
+		/* return (void *)__get_free_page(gfp_mask); */
 	}
 	if (likely(size >> PAGE_SHIFT < num_physpages))
-		return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
 
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
+
 static inline void ntfs_free(void *addr)
 {
 	if (likely(((unsigned long)addr < VMALLOC_START) ||
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 317f7c679fd..2c32b84385a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -511,7 +511,6 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 		} while (bh);
 		tail->b_this_page = head;
 		attach_page_buffers(page, head);
-		BUG_ON(!page_has_buffers(page));
 	}
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
@@ -692,7 +691,6 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	 */
 	if (!NInoTestClearDirty(ni))
 		goto done;
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 	rl = NULL;
@@ -1955,7 +1953,7 @@ restore_undo_alloc:
 	a = ctx->attr;
 	a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
 undo_alloc:
-	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1, TRUE) < 0) {
 		ntfs_error(vol->sb, "Failed to free clusters from mft data "
 				"attribute.%s", es);
 		NVolSetErrors(vol);
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 758855b0414..f5b2ac92908 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -35,7 +35,7 @@ static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
 		int size)
 {
 	if (likely((dst != src) && (size > 0)))
-		memmove(base + dst, base + src, size * sizeof (*base));
+		memmove(base + dst, base + src, size * sizeof(*base));
 }
 
 /**
@@ -95,6 +95,51 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
 }
 
 /**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl:		original runlist
+ * @old_size:	number of runlist elements in the original runlist @rl
+ * @new_size:	number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+		int old_size, int new_size)
+{
+	runlist_element *new_rl;
+
+	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+	if (old_size == new_size)
+		return rl;
+
+	new_rl = ntfs_malloc_nofs_nofail(new_size);
+	BUG_ON(!new_rl);
+
+	if (likely(rl != NULL)) {
+		if (unlikely(old_size > new_size))
+			old_size = new_size;
+		memcpy(new_rl, rl, old_size);
+		ntfs_free(rl);
+	}
+	return new_rl;
+}
+
+/**
  * ntfs_are_rl_mergeable - test if two runlists can be joined together
  * @dst:	original runlist
  * @src:	new runlist to test for mergeability with @dst
@@ -497,6 +542,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			/* Scan to the end of the source runlist. */
 			for (dend = 0; likely(drl[dend].length); dend++)
 				;
+			dend++;
 			drl = ntfs_rl_realloc(drl, dend, dend + 1);
 			if (IS_ERR(drl))
 				return drl;
@@ -566,8 +612,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
 		  (srl[send - 1].vcn + srl[send - 1].length)));
 
-	/* Or we'll lose an end marker */
-	if (start && finish && (drl[dins].length == 0))
+	/* Or we will lose an end marker. */
+	if (finish && !drl[dins].length)
 		ss++;
 	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
 		finish = FALSE;
@@ -621,11 +667,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
 				/* Add an unmapped runlist element. */
 				if (!slots) {
-					/* FIXME/TODO: We need to have the
-					 * extra memory already! (AIA) */
-					drl = ntfs_rl_realloc(drl, ds, ds + 2);
-					if (!drl)
-						goto critical_error;
+					drl = ntfs_rl_realloc_nofail(drl, ds,
+							ds + 2);
 					slots = 2;
 				}
 				ds++;
@@ -640,13 +683,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			drl[ds].length = marker_vcn - drl[ds].vcn;
 			/* Finally add the ENOENT terminator. */
 			ds++;
-			if (!slots) {
-				/* FIXME/TODO: We need to have the extra
-				 * memory already! (AIA) */
-				drl = ntfs_rl_realloc(drl, ds, ds + 1);
-				if (!drl)
-					goto critical_error;
-			}
+			if (!slots)
+				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
 			drl[ds].vcn = marker_vcn;
 			drl[ds].lcn = LCN_ENOENT;
 			drl[ds].length = (s64)0;
@@ -659,11 +697,6 @@ finished:
 	ntfs_debug("Merged runlist:");
 	ntfs_debug_dump_runlist(drl);
 	return drl;
-
-critical_error:
-	/* Critical error! We cannot afford to fail here. */
-	ntfs_error(NULL, "Critical error! Not enough memory.");
-	panic("NTFS: Cannot continue.");
 }
 
 /**
@@ -727,6 +760,9 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
 		ntfs_error(vol->sb, "Corrupt attribute.");
 		return ERR_PTR(-EIO);
 	}
+	/* If the mapping pairs array is valid but empty, nothing to do. */
+	if (!vcn && !*buf)
+		return old_rl;
 	/* Current position in runlist array. */
 	rlpos = 0;
 	/* Allocate first page and set current runlist size to one page. */
@@ -1419,6 +1455,7 @@ err_out:
 
 /**
  * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:	ntfs volume (needed for error output)
  * @runlist:	runlist to truncate
  * @new_length:	the new length of the runlist in VCNs
  *
@@ -1426,12 +1463,16 @@ err_out:
  * holding the runlist elements to a length of @new_length VCNs.
  *
  * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.
+ * @new_length and above are discarded.  As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
  *
  * If @new_length lies beyond the runlist, a sparse runlist element is added to
  * the end of the runlist @runlist or if the last runlist element is a sparse
  * one already, this is extended.
  *
+ * Note, no checking is done for unmapped runlist elements.  It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
  * Return 0 on success and -errno on error.
  *
  * Locking: The caller must hold @runlist->lock for writing.
@@ -1446,6 +1487,13 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	BUG_ON(!runlist);
 	BUG_ON(new_length < 0);
 	rl = runlist->rl;
+	if (!new_length) {
+		ntfs_debug("Freeing runlist.");
+		runlist->rl = NULL;
+		if (rl)
+			ntfs_free(rl);
+		return 0;
+	}
 	if (unlikely(!rl)) {
 		/*
 		 * Create a runlist consisting of a sparse runlist element of
@@ -1553,4 +1601,288 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	return 0;
 }
 
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol:	ntfs volume (needed for error output)
+ * @runlist:	runlist to punch a hole into
+ * @start:	starting VCN of the hole to be created
+ * @length:	size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length)
+{
+	const VCN end = start + length;
+	s64 delta;
+	runlist_element *rl, *rl_end, *rl_real_end, *trl;
+	int old_size;
+	BOOL lcn_fixup = FALSE;
+
+	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+			(long long)start, (long long)length);
+	BUG_ON(!runlist);
+	BUG_ON(start < 0);
+	BUG_ON(length < 0);
+	BUG_ON(end < 0);
+	rl = runlist->rl;
+	if (unlikely(!rl)) {
+		if (likely(!start && !length))
+			return 0;
+		return -ENOENT;
+	}
+	/* Find @start in the runlist. */
+	while (likely(rl->length && start >= rl[1].vcn))
+		rl++;
+	rl_end = rl;
+	/* Find @end in the runlist. */
+	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+		/* Verify there are no unmapped or error elements. */
+		if (unlikely(rl_end->lcn < LCN_HOLE))
+			return -EINVAL;
+		rl_end++;
+	}
+	/* Check the last element. */
+	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+		return -EINVAL;
+	/* This covers @start being out of bounds, too. */
+	if (!rl_end->length && end > rl_end->vcn)
+		return -ENOENT;
+	if (!length)
+		return 0;
+	if (!rl->length)
+		return -ENOENT;
+	rl_real_end = rl_end;
+	/* Determine the runlist size. */
+	while (likely(rl_real_end->length))
+		rl_real_end++;
+	old_size = rl_real_end - runlist->rl + 1;
+	/* If @start is in a hole simply extend the hole. */
+	if (rl->lcn == LCN_HOLE) {
+		/*
+		 * If both @start and @end are in the same sparse run, we are
+		 * done.
+		 */
+		if (end <= rl[1].vcn) {
+			ntfs_debug("Done (requested hole is already sparse).");
+			return 0;
+		}
+extend_hole:
+		/* Extend the hole. */
+		rl->length = end - rl->vcn;
+		/* If @end is in a hole, merge it with the current one. */
+		if (rl_end->lcn == LCN_HOLE) {
+			rl_end++;
+			rl->length = rl_end->vcn - rl->vcn;
+		}
+		/* We have done the hole.  Now deal with the remaining tail. */
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Adjust the beginning of the tail if necessary. */
+		if (end > rl->vcn) {
+			s64 delta = end - rl->vcn;
+			rl->vcn = end;
+			rl->length -= delta;
+			/* Only adjust the lcn if it is real. */
+			if (rl->lcn >= 0)
+				rl->lcn += delta;
+		}
+shrink_allocation:
+		/* Reallocate memory if the allocation changed. */
+		if (rl < rl_end) {
+			rl = ntfs_rl_realloc(runlist->rl, old_size,
+					old_size - (rl_end - rl));
+			if (IS_ERR(rl))
+				ntfs_warning(vol->sb, "Failed to shrink "
+						"runlist buffer.  This just "
+						"wastes a bit of memory "
+						"temporarily so we ignore it "
+						"and return success.");
+			else
+				runlist->rl = rl;
+		}
+		ntfs_debug("Done (extend hole).");
+		return 0;
+	}
+	/*
+	 * If @start is at the beginning of a run things are easier as there is
+	 * no need to split the first run.
+	 */
+	if (start == rl->vcn) {
+		/*
+		 * @start is at the beginning of a run.
+		 *
+		 * If the previous run is sparse, extend its hole.
+		 *
+		 * If @end is not in the same run, switch the run to be sparse
+		 * and extend the newly created hole.
+		 *
+		 * Thus both of these cases reduce the problem to the above
+		 * case of "@start is in a hole".
+		 */
+		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+			rl--;
+			goto extend_hole;
+		}
+		if (end >= rl[1].vcn) {
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		/*
+		 * The final case is when @end is in the same run as @start.
+		 * For this need to split the run into two.  One run for the
+		 * sparse region between the beginning of the old run, i.e.
+		 * @start, and @end and one for the remaining non-sparse
+		 * region, i.e. between @end and the end of the old run.
+		 */
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+split_end:
+		/* Shift all the runs up by one. */
+		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+		/* Finally, setup the two split runs. */
+		rl->lcn = LCN_HOLE;
+		rl->length = length;
+		rl++;
+		rl->vcn += length;
+		/* Only adjust the lcn if it is real. */
+		if (rl->lcn >= 0 || lcn_fixup)
+			rl->lcn += length;
+		rl->length -= length;
+		ntfs_debug("Done (split one).");
+		return 0;
+	}
+	/*
+	 * @start is neither in a hole nor at the beginning of a run.
+	 *
+	 * If @end is in a hole, things are easier as simply truncating the run
+	 * @start is in to end at @start - 1, deleting all runs after that up
+	 * to @end, and finally extending the beginning of the run @end is in
+	 * to be @start is all that is needed.
+	 */
+	if (rl_end->lcn == LCN_HOLE) {
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Extend the beginning of the run @end is in to be @start. */
+		rl->vcn = start;
+		rl->length = rl[1].vcn - start;
+		goto shrink_allocation;
+	}
+	/* 
+	 * If @end is not in a hole there are still two cases to distinguish.
+	 * Either @end is or is not in the same run as @start.
+	 *
+	 * The second case is easier as it can be reduced to an already solved
+	 * problem by truncating the run @start is in to end at @start - 1.
+	 * Then, if @end is in the next run need to split the run into a sparse
+	 * run followed by a non-sparse run (already covered above) and if @end
+	 * is not in the next run switching it to be sparse, again reduces the
+	 * problem to the already covered case of "@start is in a hole".
+	 */
+	if (end >= rl[1].vcn) {
+		/*
+		 * If @end is not in the next run, reduce the problem to the
+		 * case of "@start is in a hole".
+		 */
+		if (rl[1].length && end >= rl[2].vcn) {
+			/* Truncate the run containing @start. */
+			rl->length = start - rl->vcn;
+			rl++;
+			rl->vcn = start;
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/*
+		 * @end is in the next run, reduce the problem to the case
+		 * where "@start is at the beginning of a run and @end is in
+		 * the same run as @start".
+		 */
+		delta = rl->vcn - start;
+		rl->vcn = start;
+		if (rl->lcn >= 0) {
+			rl->lcn -= delta;
+			/* Need this in case the lcn just became negative. */
+			lcn_fixup = TRUE;
+		}
+		rl->length += delta;
+		goto split_end;
+	}
+	/*
+	 * The first case from above, i.e. @end is in the same run as @start.
+	 * We need to split the run into three.  One run for the non-sparse
+	 * region between the beginning of the old run and @start, one for the
+	 * sparse region between @start and @end, and one for the remaining
+	 * non-sparse region, i.e. between @end and the end of the old run.
+	 */
+	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+	if (IS_ERR(trl))
+		goto enomem_out;
+	old_size += 2;
+	if (runlist->rl != trl) {
+		rl = trl + (rl - runlist->rl);
+		rl_end = trl + (rl_end - runlist->rl);
+		rl_real_end = trl + (rl_real_end - runlist->rl);
+		runlist->rl = trl;
+	}
+	/* Shift all the runs up by two. */
+	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+	/* Finally, setup the three split runs. */
+	rl->length = start - rl->vcn;
+	rl++;
+	rl->vcn = start;
+	rl->lcn = LCN_HOLE;
+	rl->length = length;
+	rl++;
+	delta = end - rl->vcn;
+	rl->vcn = end;
+	rl->lcn += delta;
+	rl->length -= delta;
+	ntfs_debug("Done (split both).");
+	return 0;
+enomem_out:
+	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+	return -ENOMEM;
+}
+
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index aa0ee6540e7..47728fbb610 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -94,6 +94,9 @@ extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
 		runlist *const runlist, const s64 new_length);
 
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length);
+
 #endif /* NTFS_RW */
 
 #endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 41aa8eb6755..b2b39296126 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1133,7 +1133,8 @@ mft_unmap_out:
  *
  * Return TRUE on success or FALSE on error.
  */
-static BOOL load_and_check_logfile(ntfs_volume *vol)
+static BOOL load_and_check_logfile(ntfs_volume *vol,
+		RESTART_PAGE_HEADER **rp)
 {
 	struct inode *tmp_ino;
 
@@ -1145,7 +1146,7 @@ static BOOL load_and_check_logfile(ntfs_volume *vol)
 		/* Caller will display error message. */
 		return FALSE;
 	}
-	if (!ntfs_check_logfile(tmp_ino)) {
+	if (!ntfs_check_logfile(tmp_ino, rp)) {
 		iput(tmp_ino);
 		/* ntfs_check_logfile() will have displayed error output. */
 		return FALSE;
@@ -1689,6 +1690,7 @@ static BOOL load_system_files(ntfs_volume *vol)
 	VOLUME_INFORMATION *vi;
 	ntfs_attr_search_ctx *ctx;
 #ifdef NTFS_RW
+	RESTART_PAGE_HEADER *rp;
 	int err;
 #endif /* NTFS_RW */
 
@@ -1841,8 +1843,9 @@ get_ctx_vol_failed:
 	 * Get the inode for the logfile, check it and determine if the volume
 	 * was shutdown cleanly.
 	 */
-	if (!load_and_check_logfile(vol) ||
-			!ntfs_is_logfile_clean(vol->logfile_ino)) {
+	rp = NULL;
+	if (!load_and_check_logfile(vol, &rp) ||
+			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
 		static const char *es1a = "Failed to load $LogFile";
 		static const char *es1b = "$LogFile is not clean";
 		static const char *es2 = ".  Mount in Windows.";
@@ -1857,6 +1860,10 @@ get_ctx_vol_failed:
 						"continue nor on_errors="
 						"remount-ro was specified%s",
 						es1, es2);
+				if (vol->logfile_ino) {
+					BUG_ON(!rp);
+					ntfs_free(rp);
+				}
 				goto iput_logfile_err_out;
 			}
 			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
@@ -1867,6 +1874,7 @@ get_ctx_vol_failed:
 		/* This will prevent a read-write remount. */
 		NVolSetErrors(vol);
 	}
+	ntfs_free(rp);
 #endif /* NTFS_RW */
 	/* Get the root directory inode so we can do path lookups. */
 	vol->root_ino = ntfs_iget(sb, FILE_root);
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 19c42e231b4..a389a5a16c8 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -372,7 +372,8 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
 	return -EINVAL;
 conversion_err:
 	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-			"converted to character set %s.", nls->charset);
+			"converted to character set %s.  You might want to "
+			"try to use the mount option nls=utf8.", nls->charset);
 	if (ns != *outs)
 		kfree(ns);
 	if (wc != -ENAMETOOLONG)
diff --git a/fs/open.c b/fs/open.c
index 32bf05e2996..2fac58c5191 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -24,6 +24,7 @@
 #include <linux/personality.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
+#include <linux/rcupdate.h>
 
 #include <asm/unistd.h>
 
@@ -842,14 +843,16 @@ int get_unused_fd(void)
 {
 	struct files_struct * files = current->files;
 	int fd, error;
+	struct fdtable *fdt;
 
   	error = -EMFILE;
 	spin_lock(&files->file_lock);
 
 repeat:
- 	fd = find_next_zero_bit(files->open_fds->fds_bits, 
-				files->max_fdset, 
-				files->next_fd);
+	fdt = files_fdtable(files);
+ 	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
+				fdt->max_fdset,
+				fdt->next_fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
@@ -872,14 +875,14 @@ repeat:
 		goto repeat;
 	}
 
-	FD_SET(fd, files->open_fds);
-	FD_CLR(fd, files->close_on_exec);
-	files->next_fd = fd + 1;
+	FD_SET(fd, fdt->open_fds);
+	FD_CLR(fd, fdt->close_on_exec);
+	fdt->next_fd = fd + 1;
 #if 1
 	/* Sanity check */
-	if (files->fd[fd] != NULL) {
+	if (fdt->fd[fd] != NULL) {
 		printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
-		files->fd[fd] = NULL;
+		fdt->fd[fd] = NULL;
 	}
 #endif
 	error = fd;
@@ -893,9 +896,10 @@ EXPORT_SYMBOL(get_unused_fd);
 
 static inline void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
-	__FD_CLR(fd, files->open_fds);
-	if (fd < files->next_fd)
-		files->next_fd = fd;
+	struct fdtable *fdt = files_fdtable(files);
+	__FD_CLR(fd, fdt->open_fds);
+	if (fd < fdt->next_fd)
+		fdt->next_fd = fd;
 }
 
 void fastcall put_unused_fd(unsigned int fd)
@@ -924,25 +928,21 @@ EXPORT_SYMBOL(put_unused_fd);
 void fastcall fd_install(unsigned int fd, struct file * file)
 {
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
-	if (unlikely(files->fd[fd] != NULL))
-		BUG();
-	files->fd[fd] = file;
+	fdt = files_fdtable(files);
+	BUG_ON(fdt->fd[fd] != NULL);
+	rcu_assign_pointer(fdt->fd[fd], file);
 	spin_unlock(&files->file_lock);
 }
 
 EXPORT_SYMBOL(fd_install);
 
-asmlinkage long sys_open(const char __user * filename, int flags, int mode)
+long do_sys_open(const char __user *filename, int flags, int mode)
 {
-	char * tmp;
-	int fd;
+	char *tmp = getname(filename);
+	int fd = PTR_ERR(tmp);
 
-	if (force_o_largefile())
-		flags |= O_LARGEFILE;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd();
 		if (fd >= 0) {
@@ -959,6 +959,14 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
 	}
 	return fd;
 }
+
+asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(filename, flags, mode);
+}
 EXPORT_SYMBOL_GPL(sys_open);
 
 #ifndef __alpha__
@@ -1007,15 +1015,17 @@ asmlinkage long sys_close(unsigned int fd)
 {
 	struct file * filp;
 	struct files_struct *files = current->files;
+	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
-	if (fd >= files->max_fds)
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
 		goto out_unlock;
-	filp = files->fd[fd];
+	filp = fdt->fd[fd];
 	if (!filp)
 		goto out_unlock;
-	files->fd[fd] = NULL;
-	FD_CLR(fd, files->close_on_exec);
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	FD_CLR(fd, fdt->close_on_exec);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 	return filp_close(filp, files);
diff --git a/fs/pipe.c b/fs/pipe.c
index 25aa09f9d09..2c7a23dde2d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -415,6 +415,10 @@ pipe_poll(struct file *filp, poll_table *wait)
 
 	if (filp->f_mode & FMODE_WRITE) {
 		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		/*
+		 * Most Unices do not set POLLERR for FIFOs but on Linux they
+		 * behave exactly like pipes for poll().
+		 */
 		if (!PIPE_READERS(*inode))
 			mask |= POLLERR;
 	}
@@ -422,9 +426,6 @@ pipe_poll(struct file *filp, poll_table *wait)
 	return mask;
 }
 
-/* FIXME: most Unices do not set POLLERR for fifos */
-#define fifo_poll pipe_poll
-
 static int
 pipe_release(struct inode *inode, int decr, int decw)
 {
@@ -568,7 +569,7 @@ struct file_operations read_fifo_fops = {
 	.read		= pipe_read,
 	.readv		= pipe_readv,
 	.write		= bad_pipe_w,
-	.poll		= fifo_poll,
+	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_read_open,
 	.release	= pipe_read_release,
@@ -580,7 +581,7 @@ struct file_operations write_fifo_fops = {
 	.read		= bad_pipe_r,
 	.write		= pipe_write,
 	.writev		= pipe_writev,
-	.poll		= fifo_poll,
+	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_write_open,
 	.release	= pipe_write_release,
@@ -593,7 +594,7 @@ struct file_operations rdwr_fifo_fops = {
 	.readv		= pipe_readv,
 	.write		= pipe_write,
 	.writev		= pipe_writev,
-	.poll		= fifo_poll,
+	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_rdwr_open,
 	.release	= pipe_rdwr_release,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 37668fe998a..d88d518d30f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 {
 	struct group_info *group_info;
 	int g;
+	struct fdtable *fdt = NULL;
 
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
@@ -179,10 +180,12 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->gid, p->egid, p->sgid, p->fsgid);
 	read_unlock(&tasklist_lock);
 	task_lock(p);
+	if (p->files)
+		fdt = files_fdtable(p->files);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
 		"Groups:\t",
-		p->files ? p->files->max_fds : 0);
+		fdt ? fdt->max_fds : 0);
 
 	group_info = p->group_info;
 	get_group_info(group_info);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 491f2d9f89a..23db452ab42 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -11,6 +11,40 @@
  *  go into icache. We cache the reference to task_struct upon lookup too.
  *  Eventually it should become a filesystem in its own. We don't use the
  *  rest of procfs anymore.
+ *
+ *
+ *  Changelog:
+ *  17-Jan-2005
+ *  Allan Bezerra
+ *  Bruna Moreira <bruna.moreira@indt.org.br>
+ *  Edjard Mota <edjard.mota@indt.org.br>
+ *  Ilias Biris <ilias.biris@indt.org.br>
+ *  Mauricio Lin <mauricio.lin@indt.org.br>
+ *
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ *  A new process specific entry (smaps) included in /proc. It shows the
+ *  size of rss for each memory area. The maps entry lacks information
+ *  about physical memory size (rss) for each mapped file, i.e.,
+ *  rss information for executables and library files.
+ *  This additional information is useful for any tools that need to know
+ *  about physical memory consumption for a process specific library.
+ *
+ *  Changelog:
+ *  21-Feb-2005
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *  Pud inclusion in the page table walking.
+ *
+ *  ChangeLog:
+ *  10-Mar-2005
+ *  10LE Instituto Nokia de Tecnologia - INdT:
+ *  A better way to walks through the page table as suggested by Hugh Dickins.
+ *
+ *  Simo Piiroinen <simo.piiroinen@nokia.com>:
+ *  Smaps information related to shared, private, clean and dirty pages.
+ *
+ *  Paul Mundt <paul.mundt@nokia.com>:
+ *  Overall revision about smaps.
  */
 
 #include <asm/uaccess.h>
@@ -28,6 +62,7 @@
 #include <linux/namespace.h>
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
+#include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -65,8 +100,10 @@ enum pid_directory_inos {
 	PROC_TGID_STAT,
 	PROC_TGID_STATM,
 	PROC_TGID_MAPS,
+	PROC_TGID_NUMA_MAPS,
 	PROC_TGID_MOUNTS,
 	PROC_TGID_WCHAN,
+	PROC_TGID_SMAPS,
 #ifdef CONFIG_SCHEDSTATS
 	PROC_TGID_SCHEDSTAT,
 #endif
@@ -83,7 +120,6 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TGID_LOGINUID,
 #endif
-	PROC_TGID_FD_DIR,
 	PROC_TGID_OOM_SCORE,
 	PROC_TGID_OOM_ADJUST,
 	PROC_TID_INO,
@@ -102,8 +138,10 @@ enum pid_directory_inos {
 	PROC_TID_STAT,
 	PROC_TID_STATM,
 	PROC_TID_MAPS,
+	PROC_TID_NUMA_MAPS,
 	PROC_TID_MOUNTS,
 	PROC_TID_WCHAN,
+	PROC_TID_SMAPS,
 #ifdef CONFIG_SCHEDSTATS
 	PROC_TID_SCHEDSTAT,
 #endif
@@ -120,9 +158,11 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TID_LOGINUID,
 #endif
-	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
 	PROC_TID_OOM_SCORE,
 	PROC_TID_OOM_ADJUST,
+
+	/* Add new entries before this */
+	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
 };
 
 struct pid_entry {
@@ -144,6 +184,9 @@ static struct pid_entry tgid_base_stuff[] = {
 	E(PROC_TGID_STAT,      "stat",    S_IFREG|S_IRUGO),
 	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
+#ifdef CONFIG_NUMA
+	E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO),
+#endif
 	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
 #ifdef CONFIG_SECCOMP
 	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
@@ -152,6 +195,7 @@ static struct pid_entry tgid_base_stuff[] = {
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_MOUNTS,    "mounts",  S_IFREG|S_IRUGO),
+	E(PROC_TGID_SMAPS,     "smaps",   S_IFREG|S_IRUGO),
 #ifdef CONFIG_SECURITY
 	E(PROC_TGID_ATTR,      "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -180,6 +224,9 @@ static struct pid_entry tid_base_stuff[] = {
 	E(PROC_TID_STAT,       "stat",    S_IFREG|S_IRUGO),
 	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
+#ifdef CONFIG_NUMA
+	E(PROC_TID_NUMA_MAPS,  "numa_maps",    S_IFREG|S_IRUGO),
+#endif
 	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
 #ifdef CONFIG_SECCOMP
 	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
@@ -188,6 +235,7 @@ static struct pid_entry tid_base_stuff[] = {
 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_MOUNTS,     "mounts",  S_IFREG|S_IRUGO),
+	E(PROC_TID_SMAPS,      "smaps",   S_IFREG|S_IRUGO),
 #ifdef CONFIG_SECURITY
 	E(PROC_TID_ATTR,       "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -236,30 +284,36 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 
 	files = get_files_struct(task);
 	if (files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
 			*mnt = mntget(file->f_vfsmnt);
 			*dentry = dget(file->f_dentry);
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 			return 0;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 		put_files_struct(files);
 	}
 	return -ENOENT;
 }
 
-static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static struct fs_struct *get_fs_struct(struct task_struct *task)
 {
 	struct fs_struct *fs;
-	int result = -ENOENT;
-	task_lock(proc_task(inode));
-	fs = proc_task(inode)->fs;
+	task_lock(task);
+	fs = task->fs;
 	if(fs)
 		atomic_inc(&fs->count);
-	task_unlock(proc_task(inode));
+	task_unlock(task);
+	return fs;
+}
+
+static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+{
+	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	int result = -ENOENT;
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->pwdmnt);
@@ -273,13 +327,8 @@ static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfs
 
 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs;
+	struct fs_struct *fs = get_fs_struct(proc_task(inode));
 	int result = -ENOENT;
-	task_lock(proc_task(inode));
-	fs = proc_task(inode)->fs;
-	if(fs)
-		atomic_inc(&fs->count);
-	task_unlock(proc_task(inode));
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->rootmnt);
@@ -298,33 +347,6 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	 (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
 	 security_ptrace(current,task) == 0))
 
-static int may_ptrace_attach(struct task_struct *task)
-{
-	int retval = 0;
-
-	task_lock(task);
-
-	if (!task->mm)
-		goto out;
-	if (((current->uid != task->euid) ||
-	     (current->uid != task->suid) ||
-	     (current->uid != task->uid) ||
-	     (current->gid != task->egid) ||
-	     (current->gid != task->sgid) ||
-	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto out;
-	rmb();
-	if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
-		goto out;
-	if (security_ptrace(current, task))
-		goto out;
-
-	retval = 1;
-out:
-	task_unlock(task);
-	return retval;
-}
-
 static int proc_pid_environ(struct task_struct *task, char * buffer)
 {
 	int res = 0;
@@ -334,7 +356,7 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
 		res = access_process_vm(task, mm->env_start, buffer, len, 0);
-		if (!may_ptrace_attach(task))
+		if (!ptrace_may_attach(task))
 			res = -ESRCH;
 		mmput(mm);
 	}
@@ -515,6 +537,46 @@ static struct file_operations proc_maps_operations = {
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_NUMA
+extern struct seq_operations proc_pid_numa_maps_op;
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = proc_task(inode);
+	int ret = seq_open(file, &proc_pid_numa_maps_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static struct file_operations proc_numa_maps_operations = {
+	.open		= numa_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = proc_task(inode);
+	int ret = seq_open(file, &proc_pid_smaps_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static struct file_operations proc_smaps_operations = {
+	.open		= smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 extern struct seq_operations mounts_op;
 static int mounts_open(struct inode *inode, struct file *file)
 {
@@ -597,7 +659,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
 	ret = -ENOMEM;
@@ -623,7 +685,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 
 		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 		retval = access_process_vm(task, src, page, this_len, 0);
-		if (!retval || !MAY_PTRACE(task) || !may_ptrace_attach(task)) {
+		if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
 			if (!ret)
 				ret = -EIO;
 			break;
@@ -661,7 +723,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		return -ESRCH;
 
 	page = (char *)__get_free_page(GFP_USER);
@@ -978,6 +1040,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	int retval;
 	char buf[NUMBUF];
 	struct files_struct * files;
+	struct fdtable *fdt;
 
 	retval = -ENOENT;
 	if (!pid_alive(p))
@@ -1000,15 +1063,16 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			files = get_files_struct(p);
 			if (!files)
 				goto out;
-			spin_lock(&files->file_lock);
+			rcu_read_lock();
+			fdt = files_fdtable(files);
 			for (fd = filp->f_pos-2;
-			     fd < files->max_fds;
+			     fd < fdt->max_fds;
 			     fd++, filp->f_pos++) {
 				unsigned int i,j;
 
 				if (!fcheck_files(files, fd))
 					continue;
-				spin_unlock(&files->file_lock);
+				rcu_read_unlock();
 
 				j = NUMBUF;
 				i = fd;
@@ -1020,12 +1084,12 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 
 				ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
 				if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
-					spin_lock(&files->file_lock);
+					rcu_read_lock();
 					break;
 				}
-				spin_lock(&files->file_lock);
+				rcu_read_lock();
 			}
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 	}
 out:
@@ -1200,9 +1264,9 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	files = get_files_struct(task);
 	if (files) {
-		spin_lock(&files->file_lock);
+		rcu_read_lock();
 		if (fcheck_files(files, fd)) {
-			spin_unlock(&files->file_lock);
+			rcu_read_unlock();
 			put_files_struct(files);
 			if (task_dumpable(task)) {
 				inode->i_uid = task->euid;
@@ -1214,7 +1278,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 			security_task_to_inode(task, inode);
 			return 1;
 		}
-		spin_unlock(&files->file_lock);
+		rcu_read_unlock();
 		put_files_struct(files);
 	}
 	d_drop(dentry);
@@ -1306,7 +1370,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (!files)
 		goto out_unlock;
 	inode->i_mode = S_IFLNK;
-	spin_lock(&files->file_lock);
+	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock2;
@@ -1314,7 +1378,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 		inode->i_mode |= S_IRUSR | S_IXUSR;
 	if (file->f_mode & 2)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock();
 	put_files_struct(files);
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
@@ -1324,7 +1388,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	return NULL;
 
 out_unlock2:
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock();
 	put_files_struct(files);
 out_unlock:
 	iput(inode);
@@ -1524,6 +1588,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		case PROC_TGID_MAPS:
 			inode->i_fop = &proc_maps_operations;
 			break;
+#ifdef CONFIG_NUMA
+		case PROC_TID_NUMA_MAPS:
+		case PROC_TGID_NUMA_MAPS:
+			inode->i_fop = &proc_numa_maps_operations;
+			break;
+#endif
 		case PROC_TID_MEM:
 		case PROC_TGID_MEM:
 			inode->i_op = &proc_mem_inode_operations;
@@ -1539,6 +1609,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
 			break;
+		case PROC_TID_SMAPS:
+		case PROC_TGID_SMAPS:
+			inode->i_fop = &proc_smaps_operations;
+			break;
 #ifdef CONFIG_SECURITY
 		case PROC_TID_ATTR:
 			inode->i_nlink = 2;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index abe8920313f..8a8c34461d4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -249,6 +249,18 @@ out:
 	return error;
 }
 
+static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct proc_dir_entry *de = PROC_I(inode)->pde;
+	if (de && de->nlink)
+		inode->i_nlink = de->nlink;
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
 static struct inode_operations proc_file_inode_operations = {
 	.setattr	= proc_notify_change,
 };
@@ -475,6 +487,7 @@ static struct file_operations proc_dir_operations = {
  */
 static struct inode_operations proc_dir_inode_operations = {
 	.lookup		= proc_lookup,
+	.getattr	= proc_getattr,
 	.setattr	= proc_notify_change,
 };
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 133c2868510..effa6c0c467 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -60,6 +60,8 @@ static void proc_delete_inode(struct inode *inode)
 	struct proc_dir_entry *de;
 	struct task_struct *tsk;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	/* Let go of any associated process */
 	tsk = PROC_I(inode)->task;
 	if (tsk)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 28b4a0253a9..c7ef3e48e35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2,8 +2,13 @@
 #include <linux/hugetlb.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/elf.h>
 #include <asm/uaccess.h>
+#include <asm/tlbflush.h>
 #include "internal.h"
 
 char *task_mem(struct mm_struct *mm, char *buffer)
@@ -87,49 +92,58 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
-static int show_map(struct seq_file *m, void *v)
+struct mem_size_stats
+{
+	unsigned long resident;
+	unsigned long shared_clean;
+	unsigned long shared_dirty;
+	unsigned long private_clean;
+	unsigned long private_dirty;
+};
+
+static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct task_struct *task = m->private;
-	struct vm_area_struct *map = v;
-	struct mm_struct *mm = map->vm_mm;
-	struct file *file = map->vm_file;
-	int flags = map->vm_flags;
+	struct vm_area_struct *vma = v;
+	struct mm_struct *mm = vma->vm_mm;
+	struct file *file = vma->vm_file;
+	int flags = vma->vm_flags;
 	unsigned long ino = 0;
 	dev_t dev = 0;
 	int len;
 
 	if (file) {
-		struct inode *inode = map->vm_file->f_dentry->d_inode;
+		struct inode *inode = vma->vm_file->f_dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
 
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
-			map->vm_start,
-			map->vm_end,
+			vma->vm_start,
+			vma->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
-			map->vm_pgoff << PAGE_SHIFT,
+			vma->vm_pgoff << PAGE_SHIFT,
 			MAJOR(dev), MINOR(dev), ino, &len);
 
 	/*
 	 * Print the dentry name for named mappings, and a
 	 * special [heap] marker for the heap:
 	 */
-	if (map->vm_file) {
+	if (file) {
 		pad_len_spaces(m, len);
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
 	} else {
 		if (mm) {
-			if (map->vm_start <= mm->start_brk &&
-						map->vm_end >= mm->brk) {
+			if (vma->vm_start <= mm->start_brk &&
+						vma->vm_end >= mm->brk) {
 				pad_len_spaces(m, len);
 				seq_puts(m, "[heap]");
 			} else {
-				if (map->vm_start <= mm->start_stack &&
-					map->vm_end >= mm->start_stack) {
+				if (vma->vm_start <= mm->start_stack &&
+					vma->vm_end >= mm->start_stack) {
 
 					pad_len_spaces(m, len);
 					seq_puts(m, "[stack]");
@@ -141,24 +155,146 @@ static int show_map(struct seq_file *m, void *v)
 		}
 	}
 	seq_putc(m, '\n');
-	if (m->count < m->size)  /* map is copied successfully */
-		m->version = (map != get_gate_vma(task))? map->vm_start: 0;
+
+	if (mss)
+		seq_printf(m,
+			   "Size:          %8lu kB\n"
+			   "Rss:           %8lu kB\n"
+			   "Shared_Clean:  %8lu kB\n"
+			   "Shared_Dirty:  %8lu kB\n"
+			   "Private_Clean: %8lu kB\n"
+			   "Private_Dirty: %8lu kB\n",
+			   (vma->vm_end - vma->vm_start) >> 10,
+			   mss->resident >> 10,
+			   mss->shared_clean  >> 10,
+			   mss->shared_dirty  >> 10,
+			   mss->private_clean >> 10,
+			   mss->private_dirty >> 10);
+
+	if (m->count < m->size)  /* vma is copied successfully */
+		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
 	return 0;
 }
 
+static int show_map(struct seq_file *m, void *v)
+{
+	return show_map_internal(m, v, 0);
+}
+
+static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct mem_size_stats *mss)
+{
+	pte_t *pte, ptent;
+	unsigned long pfn;
+	struct page *page;
+
+	pte = pte_offset_map(pmd, addr);
+	do {
+		ptent = *pte;
+		if (pte_none(ptent) || !pte_present(ptent))
+			continue;
+
+		mss->resident += PAGE_SIZE;
+		pfn = pte_pfn(ptent);
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (page_count(page) >= 2) {
+			if (pte_dirty(ptent))
+				mss->shared_dirty += PAGE_SIZE;
+			else
+				mss->shared_clean += PAGE_SIZE;
+		} else {
+			if (pte_dirty(ptent))
+				mss->private_dirty += PAGE_SIZE;
+			else
+				mss->private_clean += PAGE_SIZE;
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(pte - 1);
+	cond_resched_lock(&vma->vm_mm->page_table_lock);
+}
+
+static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				struct mem_size_stats *mss)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		smaps_pte_range(vma, pmd, addr, next, mss);
+	} while (pmd++, addr = next, addr != end);
+}
+
+static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				struct mem_size_stats *mss)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		smaps_pmd_range(vma, pud, addr, next, mss);
+	} while (pud++, addr = next, addr != end);
+}
+
+static inline void smaps_pgd_range(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long end,
+				struct mem_size_stats *mss)
+{
+	pgd_t *pgd;
+	unsigned long next;
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		smaps_pud_range(vma, pgd, addr, next, mss);
+	} while (pgd++, addr = next, addr != end);
+}
+
+static int show_smap(struct seq_file *m, void *v)
+{
+	struct vm_area_struct *vma = v;
+	struct mm_struct *mm = vma->vm_mm;
+	struct mem_size_stats mss;
+
+	memset(&mss, 0, sizeof mss);
+
+	if (mm) {
+		spin_lock(&mm->page_table_lock);
+		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+		spin_unlock(&mm->page_table_lock);
+	}
+
+	return show_map_internal(m, v, &mss);
+}
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct task_struct *task = m->private;
 	unsigned long last_addr = m->version;
 	struct mm_struct *mm;
-	struct vm_area_struct *map, *tail_map;
+	struct vm_area_struct *vma, *tail_vma;
 	loff_t l = *pos;
 
 	/*
 	 * We remember last_addr rather than next_addr to hit with
 	 * mmap_cache most of the time. We have zero last_addr at
-	 * the begining and also after lseek. We will have -1 last_addr
-	 * after the end of the maps.
+	 * the beginning and also after lseek. We will have -1 last_addr
+	 * after the end of the vmas.
 	 */
 
 	if (last_addr == -1UL)
@@ -168,47 +304,47 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (!mm)
 		return NULL;
 
-	tail_map = get_gate_vma(task);
+	tail_vma = get_gate_vma(task);
 	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
-	if (last_addr && (map = find_vma(mm, last_addr))) {
-		map = map->vm_next;
+	if (last_addr && (vma = find_vma(mm, last_addr))) {
+		vma = vma->vm_next;
 		goto out;
 	}
 
 	/*
-	 * Check the map index is within the range and do
+	 * Check the vma index is within the range and do
 	 * sequential scan until m_index.
 	 */
-	map = NULL;
+	vma = NULL;
 	if ((unsigned long)l < mm->map_count) {
-		map = mm->mmap;
-		while (l-- && map)
-			map = map->vm_next;
+		vma = mm->mmap;
+		while (l-- && vma)
+			vma = vma->vm_next;
 		goto out;
 	}
 
 	if (l != mm->map_count)
-		tail_map = NULL; /* After gate map */
+		tail_vma = NULL; /* After gate vma */
 
 out:
-	if (map)
-		return map;
+	if (vma)
+		return vma;
 
-	/* End of maps has reached */
-	m->version = (tail_map != NULL)? 0: -1UL;
+	/* End of vmas has been reached */
+	m->version = (tail_vma != NULL)? 0: -1UL;
 	up_read(&mm->mmap_sem);
 	mmput(mm);
-	return tail_map;
+	return tail_vma;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
-	struct vm_area_struct *map = v;
-	if (map && map != get_gate_vma(task)) {
-		struct mm_struct *mm = map->vm_mm;
+	struct vm_area_struct *vma = v;
+	if (vma && vma != get_gate_vma(task)) {
+		struct mm_struct *mm = vma->vm_mm;
 		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
@@ -217,14 +353,14 @@ static void m_stop(struct seq_file *m, void *v)
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct task_struct *task = m->private;
-	struct vm_area_struct *map = v;
-	struct vm_area_struct *tail_map = get_gate_vma(task);
+	struct vm_area_struct *vma = v;
+	struct vm_area_struct *tail_vma = get_gate_vma(task);
 
 	(*pos)++;
-	if (map && (map != tail_map) && map->vm_next)
-		return map->vm_next;
+	if (vma && (vma != tail_vma) && vma->vm_next)
+		return vma->vm_next;
 	m_stop(m, v);
-	return (map != tail_map)? tail_map: NULL;
+	return (vma != tail_vma)? tail_vma: NULL;
 }
 
 struct seq_operations proc_pid_maps_op = {
@@ -233,3 +369,140 @@ struct seq_operations proc_pid_maps_op = {
 	.stop	= m_stop,
 	.show	= show_map
 };
+
+struct seq_operations proc_pid_smaps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_smap
+};
+
+#ifdef CONFIG_NUMA
+
+struct numa_maps {
+	unsigned long pages;
+	unsigned long anon;
+	unsigned long mapped;
+	unsigned long mapcount_max;
+	unsigned long node[MAX_NUMNODES];
+};
+
+/*
+ * Calculate numa node maps for a vma
+ */
+static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
+{
+	struct page *page;
+	unsigned long vaddr;
+	struct mm_struct *mm = vma->vm_mm;
+	int i;
+	struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
+
+	if (!md)
+		return NULL;
+	md->pages = 0;
+	md->anon = 0;
+	md->mapped = 0;
+	md->mapcount_max = 0;
+	for_each_node(i)
+		md->node[i] =0;
+
+	spin_lock(&mm->page_table_lock);
+ 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
+		page = follow_page(mm, vaddr, 0);
+		if (page) {
+			int count = page_mapcount(page);
+
+			if (count)
+				md->mapped++;
+			if (count > md->mapcount_max)
+				md->mapcount_max = count;
+			md->pages++;
+			if (PageAnon(page))
+				md->anon++;
+			md->node[page_to_nid(page)]++;
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+	return md;
+}
+
+static int show_numa_map(struct seq_file *m, void *v)
+{
+	struct task_struct *task = m->private;
+	struct vm_area_struct *vma = v;
+	struct mempolicy *pol;
+	struct numa_maps *md;
+	struct zone **z;
+	int n;
+	int first;
+
+	if (!vma->vm_mm)
+		return 0;
+
+	md = get_numa_maps(vma);
+	if (!md)
+		return 0;
+
+	seq_printf(m, "%08lx", vma->vm_start);
+	pol = get_vma_policy(task, vma, vma->vm_start);
+	/* Print policy */
+	switch (pol->policy) {
+	case MPOL_PREFERRED:
+		seq_printf(m, " prefer=%d", pol->v.preferred_node);
+		break;
+	case MPOL_BIND:
+		seq_printf(m, " bind={");
+		first = 1;
+		for (z = pol->v.zonelist->zones; *z; z++) {
+
+			if (!first)
+				seq_putc(m, ',');
+			else
+				first = 0;
+			seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id,
+					(*z)->name);
+		}
+		seq_putc(m, '}');
+		break;
+	case MPOL_INTERLEAVE:
+		seq_printf(m, " interleave={");
+		first = 1;
+		for_each_node(n) {
+			if (test_bit(n, pol->v.nodes)) {
+				if (!first)
+					seq_putc(m,',');
+				else
+					first = 0;
+				seq_printf(m, "%d",n);
+			}
+		}
+		seq_putc(m, '}');
+		break;
+	default:
+		seq_printf(m," default");
+		break;
+	}
+	seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu",
+			md->mapcount_max, md->pages, md->mapped);
+	if (md->anon)
+		seq_printf(m," Anon=%lu",md->anon);
+
+	for_each_online_node(n) {
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+	}
+	seq_putc(m, '\n');
+	kfree(md);
+	if (m->count < m->size)  /* vma is copied successfully */
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
+}
+
+struct seq_operations proc_pid_numa_maps_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_numa_map
+};
+#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b79162a3547..80f32911c0c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -63,6 +63,7 @@ int qnx4_sync_inode(struct inode *inode)
 static void qnx4_delete_inode(struct inode *inode)
 {
 	QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	qnx4_truncate(inode);
 	lock_kernel();
diff --git a/fs/read_write.c b/fs/read_write.c
index 563abd09b5c..b60324aaa2b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -188,7 +188,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	struct inode *inode;
 	loff_t pos;
 
-	if (unlikely(count > file->f_maxcount))
+	if (unlikely(count > INT_MAX))
 		goto Einval;
 	pos = *ppos;
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ff291c973a5..1a8a1bf2154 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -33,6 +33,8 @@ void reiserfs_delete_inode(struct inode *inode)
 	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
 	struct reiserfs_transaction_handle th;
 
+	truncate_inode_pages(&inode->i_data, 0);
+
 	reiserfs_write_lock(inode->i_sb);
 
 	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ca7989b04be..a8e29e9bbbd 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1034,7 +1034,7 @@ static int flush_commit_list(struct super_block *s,
 		    SB_ONDISK_JOURNAL_SIZE(s);
 		tbh = journal_find_get_block(s, bn);
 		if (buffer_dirty(tbh))	/* redundant, ll_rw_block() checks */
-			ll_rw_block(WRITE, 1, &tbh);
+			ll_rw_block(SWRITE, 1, &tbh);
 		put_bh(tbh);
 	}
 	atomic_dec(&journal->j_async_throttle);
@@ -2172,7 +2172,7 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 	/* flush out the real blocks */
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		set_buffer_dirty(real_blocks[i]);
-		ll_rw_block(WRITE, 1, real_blocks + i);
+		ll_rw_block(SWRITE, 1, real_blocks + i);
 	}
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		wait_on_buffer(real_blocks[i]);
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
new file mode 100644
index 00000000000..e76e182cdb3
--- /dev/null
+++ b/fs/relayfs/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_RELAYFS_FS) += relayfs.o
+
+relayfs-y := relay.o inode.o buffers.o
+
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
new file mode 100644
index 00000000000..2aa8e271999
--- /dev/null
+++ b/fs/relayfs/buffers.c
@@ -0,0 +1,189 @@
+/*
+ * RelayFS buffer management code.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+/*
+ * close() vm_op implementation for relayfs file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = vma->vm_private_data;
+	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relayfs file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+				     unsigned long address,
+				     int *type)
+{
+	struct page *page;
+	struct rchan_buf *buf = vma->vm_private_data;
+	unsigned long offset = address - vma->vm_start;
+
+	if (address > vma->vm_end)
+		return NOPAGE_SIGBUS; /* Disallow mremap */
+	if (!buf)
+		return NOPAGE_OOM;
+
+	page = vmalloc_to_page(buf->start + offset);
+	if (!page)
+		return NOPAGE_OOM;
+	get_page(page);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.nopage = relay_buf_nopage,
+	.close = relay_file_mmap_close,
+};
+
+/**
+ *	relay_mmap_buf: - mmap channel buffer to process address space
+ *	@buf: relay channel buffer
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns 0 if ok, negative on error
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (!buf)
+		return -EBADF;
+
+	if (length != (unsigned long)buf->chan->alloc_size)
+		return -EINVAL;
+
+	vma->vm_ops = &relay_file_mmap_ops;
+	vma->vm_private_data = buf;
+	buf->chan->cb->buf_mapped(buf, filp);
+
+	return 0;
+}
+
+/**
+ *	relay_alloc_buf - allocate a channel buffer
+ *	@buf: the buffer struct
+ *	@size: total size of the buffer
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+	void *mem;
+	unsigned int i, j, n_pages;
+
+	size = PAGE_ALIGN(size);
+	n_pages = size >> PAGE_SHIFT;
+
+	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!buf->page_array)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!buf->page_array[i]))
+			goto depopulate;
+	}
+	mem = vmap(buf->page_array, n_pages, GFP_KERNEL, PAGE_KERNEL);
+	if (!mem)
+		goto depopulate;
+
+	memset(mem, 0, size);
+	buf->page_count = n_pages;
+	return mem;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		__free_page(buf->page_array[j]);
+	kfree(buf->page_array);
+	return NULL;
+}
+
+/**
+ *	relay_create_buf - allocate and initialize a channel buffer
+ *	@alloc_size: size of the buffer to allocate
+ *	@n_subbufs: number of sub-buffers in the channel
+ *
+ *	Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+	if (!buf->padding)
+		goto free_buf;
+
+	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	if (!buf->start)
+		goto free_buf;
+
+	buf->chan = chan;
+	kref_get(&buf->chan->kref);
+	return buf;
+
+free_buf:
+	kfree(buf->padding);
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		kfree(buf->page_array);
+	}
+	kfree(buf->padding);
+	kfree(buf);
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *
+ *	Removes the file from the relayfs fileystem, which also frees the
+ *	rchan_buf_struct and the channel buffer.  Should only be called from
+ *	kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	relayfs_remove(buf->dentry);
+}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
new file mode 100644
index 00000000000..37a12493f64
--- /dev/null
+++ b/fs/relayfs/buffers.h
@@ -0,0 +1,12 @@
+#ifndef _BUFFERS_H
+#define _BUFFERS_H
+
+/* This inspired by rtai/shmem */
+#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE
+
+extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
+extern struct rchan_buf *relay_create_buf(struct rchan *chan);
+extern void relay_destroy_buf(struct rchan_buf *buf);
+extern void relay_remove_buf(struct kref *kref);
+
+#endif/* _BUFFERS_H */
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
new file mode 100644
index 00000000000..0f7f88d067a
--- /dev/null
+++ b/fs/relayfs/inode.c
@@ -0,0 +1,609 @@
+/*
+ * VFS-related code for RelayFS, a high-speed data relay filesystem.
+ *
+ * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
+ * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
+ *
+ * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+#define RELAYFS_MAGIC			0xF0B4A981
+
+static struct vfsmount *		relayfs_mount;
+static int				relayfs_mount_count;
+static kmem_cache_t *			relayfs_inode_cachep;
+
+static struct backing_dev_info		relayfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
+				       struct rchan *chan)
+{
+	struct rchan_buf *buf = NULL;
+	struct inode *inode;
+
+	if (S_ISREG(mode)) {
+		BUG_ON(!chan);
+		buf = relay_create_buf(chan);
+		if (!buf)
+			return NULL;
+	}
+
+	inode = new_inode(sb);
+	if (!inode) {
+		relay_destroy_buf(buf);
+		return NULL;
+	}
+
+	inode->i_mode = mode;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_fop = &relayfs_file_operations;
+		RELAYFS_I(inode)->buf = buf;
+		break;
+	case S_IFDIR:
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+		break;
+	default:
+		break;
+	}
+
+	return inode;
+}
+
+/**
+ *	relayfs_create_entry - create a relayfs directory or file
+ *	@name: the name of the file to create
+ *	@parent: parent directory
+ *	@mode: mode
+ *	@chan: relay channel associated with the file
+ *
+ *	Returns the new dentry, NULL on failure
+ *
+ *	Creates a file or directory with the specifed permissions.
+ */
+static struct dentry *relayfs_create_entry(const char *name,
+					   struct dentry *parent,
+					   int mode,
+					   struct rchan *chan)
+{
+	struct dentry *d;
+	struct inode *inode;
+	int error = 0;
+
+	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
+
+	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
+	if (error) {
+		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
+		return NULL;
+	}
+
+	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
+		parent = relayfs_mount->mnt_sb->s_root;
+
+	if (!parent) {
+		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+		return NULL;
+	}
+
+	parent = dget(parent);
+	down(&parent->d_inode->i_sem);
+	d = lookup_one_len(name, parent, strlen(name));
+	if (IS_ERR(d)) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	if (d->d_inode) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan);
+	if (!inode) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	d_instantiate(d, inode);
+	dget(d);	/* Extra count - pin the dentry in core */
+
+	if (S_ISDIR(mode))
+		parent->d_inode->i_nlink++;
+
+	goto exit;
+
+release_mount:
+	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+exit:
+	up(&parent->d_inode->i_sem);
+	dput(parent);
+	return d;
+}
+
+/**
+ *	relayfs_create_file - create a file in the relay filesystem
+ *	@name: the name of the file to create
+ *	@parent: parent directory
+ *	@mode: mode, if not specied the default perms are used
+ *	@chan: channel associated with the file
+ *
+ *	Returns file dentry if successful, NULL otherwise.
+ *
+ *	The file will be created user r on behalf of current user.
+ */
+struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
+				   int mode, struct rchan *chan)
+{
+	if (!mode)
+		mode = S_IRUSR;
+	mode = (mode & S_IALLUGO) | S_IFREG;
+
+	return relayfs_create_entry(name, parent, mode, chan);
+}
+
+/**
+ *	relayfs_create_dir - create a directory in the relay filesystem
+ *	@name: the name of the directory to create
+ *	@parent: parent directory, NULL if parent should be fs root
+ *
+ *	Returns directory dentry if successful, NULL otherwise.
+ *
+ *	The directory will be created world rwx on behalf of current user.
+ */
+struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
+{
+	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	return relayfs_create_entry(name, parent, mode, NULL);
+}
+
+/**
+ *	relayfs_remove - remove a file or directory in the relay filesystem
+ *	@dentry: file or directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	int error = 0;
+
+	if (!dentry)
+		return -EINVAL;
+	parent = dentry->d_parent;
+	if (!parent)
+		return -EINVAL;
+
+	parent = dget(parent);
+	down(&parent->d_inode->i_sem);
+	if (dentry->d_inode) {
+		if (S_ISDIR(dentry->d_inode->i_mode))
+			error = simple_rmdir(parent->d_inode, dentry);
+		else
+			error = simple_unlink(parent->d_inode, dentry);
+		if (!error)
+			d_delete(dentry);
+	}
+	if (!error)
+		dput(dentry);
+	up(&parent->d_inode->i_sem);
+	dput(parent);
+
+	if (!error)
+		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+	return error;
+}
+
+/**
+ *	relayfs_remove_dir - remove a directory in the relay filesystem
+ *	@dentry: directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove_dir(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
+/**
+ *	relayfs_open - open file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relayfs_open(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	kref_get(&buf->kref);
+
+	return 0;
+}
+
+/**
+ *	relayfs_mmap - mmap file op for relayfs files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	return relay_mmap_buf(RELAYFS_I(inode)->buf, vma);
+}
+
+/**
+ *	relayfs_poll - poll file op for relayfs files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implemention.
+ */
+static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relayfs_release - release file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+static int relayfs_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	kref_put(&buf->kref, relay_remove_buf);
+
+	return 0;
+}
+
+/**
+ *	relayfs_read_consume - update the consumed count for the buffer
+ */
+static void relayfs_read_consume(struct rchan_buf *buf,
+				 size_t read_pos,
+				 size_t bytes_consumed)
+{
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t read_subbuf;
+
+	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+
+	buf->bytes_consumed += bytes_consumed;
+	read_subbuf = read_pos / buf->chan->subbuf_size;
+	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+		    (buf->offset == subbuf_size))
+			return;
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+}
+
+/**
+ *	relayfs_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+	size_t bytes_produced, bytes_consumed, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t produced = buf->subbufs_produced % n_subbufs;
+	size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+
+	if (consumed > produced) {
+		if ((produced > n_subbufs) &&
+		    (produced + n_subbufs - consumed <= n_subbufs))
+			produced += n_subbufs;
+	} else if (consumed == produced) {
+		if (buf->offset > subbuf_size) {
+			produced += n_subbufs;
+			if (buf->subbufs_produced == buf->subbufs_consumed)
+				consumed += n_subbufs;
+		}
+	}
+
+	if (buf->offset > subbuf_size)
+		bytes_produced = (produced - 1) * subbuf_size + write_offset;
+	else
+		bytes_produced = produced * subbuf_size + write_offset;
+	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+
+	if (bytes_produced == bytes_consumed)
+		return 0;
+
+	relayfs_read_consume(buf, read_pos, 0);
+
+	return 1;
+}
+
+/**
+ *	relayfs_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relayfs_read_subbuf_avail(size_t read_pos,
+					struct rchan_buf *buf)
+{
+	size_t padding, avail = 0;
+	size_t read_subbuf, read_offset, write_subbuf, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+
+	write_subbuf = (buf->data - buf->start) / subbuf_size;
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	read_subbuf = read_pos / subbuf_size;
+	read_offset = read_pos % subbuf_size;
+	padding = buf->padding[read_subbuf];
+
+	if (read_subbuf == write_subbuf) {
+		if (read_offset + padding < write_offset)
+			avail = write_offset - (read_offset + padding);
+	} else
+		avail = (subbuf_size - padding) - read_offset;
+
+	return avail;
+}
+
+/**
+ *	relayfs_read_start_pos - find the first available byte to read
+ *
+ *	If the read_pos is in the middle of padding, return the
+ *	position of the first actually available byte, otherwise
+ *	return the original value.
+ */
+static size_t relayfs_read_start_pos(size_t read_pos,
+				     struct rchan_buf *buf)
+{
+	size_t read_subbuf, padding, padding_start, padding_end;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	padding_start = (read_subbuf + 1) * subbuf_size - padding;
+	padding_end = (read_subbuf + 1) * subbuf_size;
+	if (read_pos >= padding_start && read_pos < padding_end) {
+		read_subbuf = (read_subbuf + 1) % n_subbufs;
+		read_pos = read_subbuf * subbuf_size;
+	}
+
+	return read_pos;
+}
+
+/**
+ *	relayfs_read_end_pos - return the new read position
+ */
+static size_t relayfs_read_end_pos(struct rchan_buf *buf,
+				   size_t read_pos,
+				   size_t count)
+{
+	size_t read_subbuf, padding, end_pos;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	if (read_pos % subbuf_size + count + padding == subbuf_size)
+		end_pos = (read_subbuf + 1) * subbuf_size;
+	else
+		end_pos = read_pos + count;
+	if (end_pos >= subbuf_size * n_subbufs)
+		end_pos = 0;
+
+	return end_pos;
+}
+
+/**
+ *	relayfs_read - read file op for relayfs files
+ *	@filp: the file
+ *	@buffer: the userspace buffer
+ *	@count: number of bytes to read
+ *	@ppos: position to read from
+ *
+ *	Reads count bytes or the number of bytes available in the
+ *	current sub-buffer being read, whichever is smaller.
+ */
+static ssize_t relayfs_read(struct file *filp,
+			    char __user *buffer,
+			    size_t count,
+			    loff_t *ppos)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	size_t read_start, avail;
+	ssize_t ret = 0;
+	void *from;
+
+	down(&inode->i_sem);
+	if(!relayfs_read_avail(buf, *ppos))
+		goto out;
+
+	read_start = relayfs_read_start_pos(*ppos, buf);
+	avail = relayfs_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		goto out;
+
+	from = buf->start + read_start;
+	ret = count = min(count, avail);
+	if (copy_to_user(buffer, from, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	relayfs_read_consume(buf, read_start, count);
+	*ppos = relayfs_read_end_pos(buf, read_start, count);
+out:
+	up(&inode->i_sem);
+	return ret;
+}
+
+/**
+ *	relayfs alloc_inode() implementation
+ */
+static struct inode *relayfs_alloc_inode(struct super_block *sb)
+{
+	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
+	if (!p)
+		return NULL;
+	p->buf = NULL;
+
+	return &p->vfs_inode;
+}
+
+/**
+ *	relayfs destroy_inode() implementation
+ */
+static void relayfs_destroy_inode(struct inode *inode)
+{
+	if (RELAYFS_I(inode)->buf)
+		relay_destroy_buf(RELAYFS_I(inode)->buf);
+
+	kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode));
+}
+
+static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct relayfs_inode_info *i = p;
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&i->vfs_inode);
+}
+
+struct file_operations relayfs_file_operations = {
+	.open		= relayfs_open,
+	.poll		= relayfs_poll,
+	.mmap		= relayfs_mmap,
+	.read		= relayfs_read,
+	.llseek		= no_llseek,
+	.release	= relayfs_release,
+};
+
+static struct super_operations relayfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.alloc_inode	= relayfs_alloc_inode,
+	.destroy_inode	= relayfs_destroy_inode,
+};
+
+static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = RELAYFS_MAGIC;
+	sb->s_op = &relayfs_ops;
+	inode = relayfs_get_inode(sb, mode, NULL);
+
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+
+	return 0;
+}
+
+static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
+					   int flags, const char *dev_name,
+					   void *data)
+{
+	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
+}
+
+static struct file_system_type relayfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "relayfs",
+	.get_sb		= relayfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static int __init init_relayfs_fs(void)
+{
+	int err;
+
+	relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
+				sizeof(struct relayfs_inode_info), 0,
+				0, init_once, NULL);
+	if (!relayfs_inode_cachep)
+		return -ENOMEM;
+
+	err = register_filesystem(&relayfs_fs_type);
+	if (err)
+		kmem_cache_destroy(relayfs_inode_cachep);
+
+	return err;
+}
+
+static void __exit exit_relayfs_fs(void)
+{
+	unregister_filesystem(&relayfs_fs_type);
+	kmem_cache_destroy(relayfs_inode_cachep);
+}
+
+module_init(init_relayfs_fs)
+module_exit(exit_relayfs_fs)
+
+EXPORT_SYMBOL_GPL(relayfs_file_operations);
+EXPORT_SYMBOL_GPL(relayfs_create_dir);
+EXPORT_SYMBOL_GPL(relayfs_remove_dir);
+
+MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
+MODULE_DESCRIPTION("Relay Filesystem");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
new file mode 100644
index 00000000000..16446a15c96
--- /dev/null
+++ b/fs/relayfs/relay.c
@@ -0,0 +1,431 @@
+/*
+ * Public API and common code for RelayFS.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+
+/*
+ * High-level relayfs kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@private: the channel buffer
+ *
+ *	This is the work function used to defer reader waking.  The
+ *	reason waking is deferred is that calling directly from write
+ *	causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+	struct rchan_buf *buf = private;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+	size_t i;
+
+	if (init) {
+		init_waitqueue_head(&buf->read_wait);
+		kref_init(&buf->kref);
+		INIT_WORK(&buf->wake_readers, NULL, NULL);
+	} else {
+		cancel_delayed_work(&buf->wake_readers);
+		flush_scheduled_work();
+	}
+
+	buf->subbufs_produced = 0;
+	buf->subbufs_consumed = 0;
+	buf->bytes_consumed = 0;
+	buf->finalized = 0;
+	buf->data = buf->start;
+	buf->offset = 0;
+
+	for (i = 0; i < buf->chan->n_subbufs; i++)
+		buf->padding[i] = 0;
+
+	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ *	relay_reset - reset the channel
+ *	@chan: the channel
+ *
+ *	This has the effect of erasing all data from all channel buffers
+ *	and restarting the channel in its initial state.  The buffers
+ *	are not freed, so any mappings are still in effect.
+ *
+ *	NOTE: Care should be taken that the channel isn't actually
+ *	being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		__relay_reset(chan->buf[i], 0);
+	}
+}
+
+/**
+ *	relay_open_buf - create a new channel buffer in relayfs
+ *
+ *	Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+					const char *filename,
+					struct dentry *parent)
+{
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+
+	/* Create file in fs */
+	dentry = relayfs_create_file(filename, parent, S_IRUSR, chan);
+	if (!dentry)
+		return NULL;
+
+	buf = RELAYFS_I(dentry->d_inode)->buf;
+	buf->dentry = dentry;
+	__relay_reset(buf, 1);
+
+	return buf;
+}
+
+/**
+ *	relay_close_buf - close a channel buffer
+ *	@buf: channel buffer
+ *
+ *	Marks the buffer finalized and restores the default callbacks.
+ *	The channel buffer and channel buffer data structure are then freed
+ *	automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+	buf->finalized = 1;
+	buf->chan->cb = &default_channel_callbacks;
+	cancel_delayed_work(&buf->wake_readers);
+	flush_scheduled_work();
+	kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+				   struct rchan_callbacks *cb)
+{
+	if (!cb) {
+		chan->cb = &default_channel_callbacks;
+		return;
+	}
+
+	if (!cb->subbuf_start)
+		cb->subbuf_start = subbuf_start_default_callback;
+	if (!cb->buf_mapped)
+		cb->buf_mapped = buf_mapped_default_callback;
+	if (!cb->buf_unmapped)
+		cb->buf_unmapped = buf_unmapped_default_callback;
+	chan->cb = cb;
+}
+
+/**
+ *	relay_open - create a new relayfs channel
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, NULL for root directory
+ *	@subbuf_size: size of sub-buffers
+ *	@n_subbufs: number of sub-buffers
+ *	@cb: client callback functions
+ *
+ *	Returns channel pointer if successful, NULL otherwise.
+ *
+ *	Creates a channel buffer for each cpu using the sizes and
+ *	attributes specified.  The created channel buffer files
+ *	will be named base_filename0...base_filenameN-1.  File
+ *	permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb)
+{
+	unsigned int i;
+	struct rchan *chan;
+	char *tmpname;
+
+	if (!base_filename)
+		return NULL;
+
+	if (!(subbuf_size && n_subbufs))
+		return NULL;
+
+	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+	if (!chan)
+		return NULL;
+
+	chan->version = RELAYFS_CHANNEL_VERSION;
+	chan->n_subbufs = n_subbufs;
+	chan->subbuf_size = subbuf_size;
+	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+	setup_callbacks(chan, cb);
+	kref_init(&chan->kref);
+
+	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		goto free_chan;
+
+	for_each_online_cpu(i) {
+		sprintf(tmpname, "%s%d", base_filename, i);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent);
+		chan->buf[i]->cpu = i;
+		if (!chan->buf[i])
+			goto free_bufs;
+	}
+
+	kfree(tmpname);
+	return chan;
+
+free_bufs:
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			break;
+		relay_close_buf(chan->buf[i]);
+	}
+	kfree(tmpname);
+
+free_chan:
+	kref_put(&chan->kref, relay_destroy_channel);
+	return NULL;
+}
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+	void *old, *new;
+	size_t old_subbuf, new_subbuf;
+
+	if (unlikely(length > buf->chan->subbuf_size))
+		goto toobig;
+
+	if (buf->offset != buf->chan->subbuf_size + 1) {
+		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->subbufs_produced++;
+		if (waitqueue_active(&buf->read_wait)) {
+			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			schedule_delayed_work(&buf->wake_readers, 1);
+		}
+	}
+
+	old = buf->data;
+	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+	new = buf->start + new_subbuf * buf->chan->subbuf_size;
+	buf->offset = 0;
+	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+		buf->offset = buf->chan->subbuf_size + 1;
+		return 0;
+	}
+	buf->data = new;
+	buf->padding[new_subbuf] = 0;
+
+	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+		goto toobig;
+
+	return length;
+
+toobig:
+	printk(KERN_WARNING "relayfs: event too large (%Zd)\n", length);
+	WARN_ON(1);
+	return 0;
+}
+
+/**
+ *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ *	@chan: the channel
+ *	@cpu: the cpu associated with the channel buffer to update
+ *	@subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ *	Adds to the channel buffer's consumed sub-buffer count.
+ *	subbufs_consumed should be the number of sub-buffers newly consumed,
+ *	not the total consumed.
+ *
+ *	NOTE: kernel clients don't need to call this function if the channel
+ *	mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	if (cpu >= NR_CPUS || !chan->buf[cpu])
+		return;
+
+	buf = chan->buf[cpu];
+	buf->subbufs_consumed += subbufs_consumed;
+	if (buf->subbufs_consumed > buf->subbufs_produced)
+		buf->subbufs_consumed = buf->subbufs_produced;
+}
+
+/**
+ *	relay_destroy_channel - free the channel struct
+ *
+ *	Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		relay_close_buf(chan->buf[i]);
+	}
+
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_flush - close the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		relay_switch_subbuf(chan->buf[i], 0);
+	}
+}
+
+EXPORT_SYMBOL_GPL(relay_open);
+EXPORT_SYMBOL_GPL(relay_close);
+EXPORT_SYMBOL_GPL(relay_flush);
+EXPORT_SYMBOL_GPL(relay_reset);
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
new file mode 100644
index 00000000000..703503fa22b
--- /dev/null
+++ b/fs/relayfs/relay.h
@@ -0,0 +1,12 @@
+#ifndef _RELAY_H
+#define _RELAY_H
+
+struct dentry *relayfs_create_file(const char *name,
+				   struct dentry *parent,
+				   int mode,
+				   struct rchan *chan);
+extern int relayfs_remove(struct dentry *dentry);
+extern int relay_buf_empty(struct rchan_buf *buf);
+extern void relay_destroy_channel(struct kref *kref);
+
+#endif /* _RELAY_H */
diff --git a/fs/select.c b/fs/select.c
index b80e7eb0ac0..f10a10317d5 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -22,6 +22,7 @@
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -132,11 +133,13 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
 	unsigned long *open_fds;
 	unsigned long set;
 	int max;
+	struct fdtable *fdt;
 
 	/* handle last in-complete long-word first */
 	set = ~(~0UL << (n & (__NFDBITS-1)));
 	n /= __NFDBITS;
-	open_fds = current->files->open_fds->fds_bits+n;
+	fdt = files_fdtable(current->files);
+	open_fds = fdt->open_fds->fds_bits+n;
 	max = 0;
 	if (set) {
 		set &= BITS(fds, n);
@@ -183,9 +186,9 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
 	int retval, i;
 	long __timeout = *timeout;
 
- 	spin_lock(&current->files->file_lock);
+	rcu_read_lock();
 	retval = max_select_fd(n, fds);
-	spin_unlock(&current->files->file_lock);
+	rcu_read_unlock();
 
 	if (retval < 0)
 		return retval;
@@ -299,6 +302,7 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 	char *bits;
 	long timeout;
 	int ret, size, max_fdset;
+	struct fdtable *fdt;
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -326,7 +330,10 @@ sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s
 		goto out_nofds;
 
 	/* max_fdset can increase, so grab it once to avoid race */
-	max_fdset = current->files->max_fdset;
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
 	if (n > max_fdset)
 		n = max_fdset;
 
@@ -464,9 +471,15 @@ asmlinkage long sys_poll(struct pollfd __user * ufds, unsigned int nfds, long ti
  	unsigned int i;
 	struct poll_list *head;
  	struct poll_list *walk;
+	struct fdtable *fdt;
+	int max_fdset;
 
 	/* Do a sanity check on nfds ... */
-	if (nfds > current->files->max_fdset && nfds > OPEN_MAX)
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fdset = fdt->max_fdset;
+	rcu_read_unlock();
+	if (nfds > max_fdset && nfds > OPEN_MAX)
 		return -EINVAL;
 
 	if (timeout) {
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4765aaac9fd..10b994428fe 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -331,6 +331,7 @@ static void
 smb_delete_inode(struct inode *ino)
 {
 	DEBUG1("ino=%ld\n", ino->i_ino);
+	truncate_inode_pages(&ino->i_data, 0);
 	lock_kernel();
 	if (smb_close(ino))
 		PARANOIA("could not close inode %ld\n", ino->i_ino);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0530077d9dd..fa33eceb001 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -292,6 +292,7 @@ int sysv_sync_inode(struct inode * inode)
 
 static void sysv_delete_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	sysv_truncate(inode);
 	lock_kernel();
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 3d68de39fad..b83890beaaa 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -87,6 +87,8 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
  */
 void udf_delete_inode(struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+
 	if (is_bad_inode(inode))
 		goto no_delete;
 
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 997640c99c7..faf1512173e 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,8 +114,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 	sb->s_dirt = 1;
@@ -200,8 +199,7 @@ do_more:
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 
@@ -459,8 +457,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 	sb->s_dirt = 1;
@@ -585,8 +582,7 @@ succed:
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 	sb->s_dirt = 1;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 61a6b1542fc..0938945b9cb 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -124,8 +124,7 @@ void ufs_free_inode (struct inode * inode)
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **) &ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 	
@@ -249,8 +248,7 @@ cg_found:
 	ubh_mark_buffer_dirty (USPI_UBH);
 	ubh_mark_buffer_dirty (UCPI_UBH);
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_wait_on_buffer (UCPI_UBH);
-		ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **) &ucpi);
+		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
 		ubh_wait_on_buffer (UCPI_UBH);
 	}
 	sb->s_dirt = 1;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 718627ca8b5..55f4aa16e3f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -804,6 +804,7 @@ int ufs_sync_inode (struct inode *inode)
 
 void ufs_delete_inode (struct inode * inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
 	/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
 	lock_kernel();
 	mark_inode_dirty(inode);
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index e312bf8bad9..61d2e35012a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -285,8 +285,7 @@ next:;
 		}
 	}
 	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-		ubh_wait_on_buffer (ind_ubh);
-		ubh_ll_rw_block (WRITE, 1, &ind_ubh);
+		ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
 		ubh_wait_on_buffer (ind_ubh);
 	}
 	ubh_brelse (ind_ubh);
@@ -353,8 +352,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 		}
 	}
 	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-		ubh_wait_on_buffer (dind_bh);
-		ubh_ll_rw_block (WRITE, 1, &dind_bh);
+		ubh_ll_rw_block (SWRITE, 1, &dind_bh);
 		ubh_wait_on_buffer (dind_bh);
 	}
 	ubh_brelse (dind_bh);
@@ -418,8 +416,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 		}
 	}
 	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-		ubh_wait_on_buffer (tind_bh);
-		ubh_ll_rw_block (WRITE, 1, &tind_bh);
+		ubh_ll_rw_block (SWRITE, 1, &tind_bh);
 		ubh_wait_on_buffer (tind_bh);
 	}
 	ubh_brelse (tind_bh);
diff --git a/fs/umsdos/notes b/fs/umsdos/notes
deleted file mode 100644
index 3c47d1f4fc4..00000000000
--- a/fs/umsdos/notes
+++ /dev/null
@@ -1,17 +0,0 @@
-This file contain idea and things I don't want to forget
-
-Possible bug in fs/read_write.c
-Function sys_readdir()
-
-	There is a call the verify_area that does not take in account
-	the count parameter. I guess it should read
-
-	error = verify_area(VERIFY_WRITE, dirent, count*sizeof (*dirent));
-
-	instead of
-
-	error = verify_area(VERIFY_WRITE, dirent, sizeof (*dirent));
-
-	Of course, now , count is always 1
-
-
diff --git a/fs/xattr.c b/fs/xattr.c
index 6acd5c63da9..3f9c64bea15 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -51,20 +51,29 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
 		}
 	}
 
+	down(&d->d_inode->i_sem);
+	error = security_inode_setxattr(d, kname, kvalue, size, flags);
+	if (error)
+		goto out;
 	error = -EOPNOTSUPP;
 	if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
-		down(&d->d_inode->i_sem);
-		error = security_inode_setxattr(d, kname, kvalue, size, flags);
-		if (error)
-			goto out;
-		error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags);
+		error = d->d_inode->i_op->setxattr(d, kname, kvalue,
+						   size, flags);
 		if (!error) {
 			fsnotify_xattr(d);
-			security_inode_post_setxattr(d, kname, kvalue, size, flags);
+			security_inode_post_setxattr(d, kname, kvalue,
+						     size, flags);
 		}
-out:
-		up(&d->d_inode->i_sem);
+	} else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
+			    sizeof XATTR_SECURITY_PREFIX - 1)) {
+		const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
+		error = security_inode_setsecurity(d->d_inode, suffix, kvalue,
+						   size, flags);
+		if (!error)
+			fsnotify_xattr(d);
 	}
+out:
+	up(&d->d_inode->i_sem);
 	if (kvalue)
 		kfree(kvalue);
 	return error;
@@ -139,20 +148,25 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
 			return -ENOMEM;
 	}
 
+	error = security_inode_getxattr(d, kname);
+	if (error)
+		goto out;
 	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->getxattr) {
-		error = security_inode_getxattr(d, kname);
-		if (error)
-			goto out;
+	if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
 		error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
-		if (error > 0) {
-			if (size && copy_to_user(value, kvalue, error))
-				error = -EFAULT;
-		} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
-			/* The file system tried to returned a value bigger
-			   than XATTR_SIZE_MAX bytes. Not possible. */
-			error = -E2BIG;
-		}
+	else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
+			  sizeof XATTR_SECURITY_PREFIX - 1)) {
+		const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
+		error = security_inode_getsecurity(d->d_inode, suffix, kvalue,
+						   size);
+	}
+	if (error > 0) {
+		if (size && copy_to_user(value, kvalue, error))
+			error = -EFAULT;
+	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+		/* The file system tried to returned a value bigger
+		   than XATTR_SIZE_MAX bytes. Not possible. */
+		error = -E2BIG;
 	}
 out:
 	if (kvalue)
@@ -221,20 +235,24 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 			return -ENOMEM;
 	}
 
+	error = security_inode_listxattr(d);
+	if (error)
+		goto out;
 	error = -EOPNOTSUPP;
 	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-		error = security_inode_listxattr(d);
-		if (error)
-			goto out;
 		error = d->d_inode->i_op->listxattr(d, klist, size);
-		if (error > 0) {
-			if (size && copy_to_user(list, klist, error))
-				error = -EFAULT;
-		} else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
-			/* The file system tried to returned a list bigger
-			   than XATTR_LIST_MAX bytes. Not possible. */
-			error = -E2BIG;
-		}
+	} else {
+		error = security_inode_listsecurity(d->d_inode, klist, size);
+		if (size && error >= size)
+			error = -ERANGE;
+	}
+	if (error > 0) {
+		if (size && copy_to_user(list, klist, error))
+			error = -EFAULT;
+	} else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
+		/* The file system tried to returned a list bigger
+		   than XATTR_LIST_MAX bytes. Not possible. */
+		error = -E2BIG;
 	}
 out:
 	if (klist)
@@ -307,6 +325,8 @@ removexattr(struct dentry *d, char __user *name)
 		down(&d->d_inode->i_sem);
 		error = d->d_inode->i_op->removexattr(d, kname);
 		up(&d->d_inode->i_sem);
+		if (!error)
+			fsnotify_xattr(d);
 	}
 out:
 	return error;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index c92306f0fdc..8e8f32dabe5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,5 +1,3 @@
-menu "XFS support"
-
 config XFS_FS
 	tristate "XFS filesystem support"
 	select EXPORTFS if NFSD!=n
@@ -22,27 +20,11 @@ config XFS_FS
 
 config XFS_EXPORT
 	bool
-	default y if XFS_FS && EXPORTFS
-
-config XFS_RT
-	bool "Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
-	help
-	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
-
-	  See the xfs man page in section 5 for a bit more information.
-
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
-
-	  If unsure, say N.
+	depends on XFS_FS && EXPORTFS
+	default y
 
 config XFS_QUOTA
-	bool "Quota support"
+	tristate "XFS Quota support"
 	depends on XFS_FS
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
@@ -59,7 +41,7 @@ config XFS_QUOTA
 	  they are completely independent subsystems.
 
 config XFS_SECURITY
-	bool "Security Label support"
+	bool "XFS Security Label support"
 	depends on XFS_FS
 	help
 	  Security labels support alternative access control models
@@ -71,7 +53,7 @@ config XFS_SECURITY
 	  extended attributes for inode security labels, say N.
 
 config XFS_POSIX_ACL
-	bool "POSIX ACL support"
+	bool "XFS POSIX ACL support"
 	depends on XFS_FS
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -82,4 +64,19 @@ config XFS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N.
 
-endmenu
+config XFS_RT
+	bool "XFS Realtime support (EXPERIMENTAL)"
+	depends on XFS_FS && EXPERIMENTAL
+	help
+	  If you say Y here you will be able to mount and use XFS filesystems
+	  which contain a realtime subvolume. The realtime subvolume is a
+	  separate area of disk space where only file data is stored. The
+	  realtime subvolume is designed to provide very deterministic
+	  data rates suitable for media streaming applications.
+
+	  See the xfs man page in section 5 for a bit more information.
+
+	  This feature is unsupported at this time, is not yet fully
+	  functional, and may cause serious problems.
+
+	  If unsure, say N.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d3ff7835463..49e3e7e5e3d 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -1,150 +1 @@
-#
-# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of version 2 of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# Further, this software is distributed without any warranty that it is
-# free of the rightful claim of any third person regarding infringement
-# or the like.  Any license provided herein, whether implied or
-# otherwise, applies only to this software file.  Patent licenses, if
-# any, provided herein do not apply to combinations of this program with
-# other software, or any other product whatsoever.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write the Free Software Foundation, Inc., 59
-# Temple Place - Suite 330, Boston MA 02111-1307, USA.
-#
-# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
-# Mountain View, CA  94043, or:
-#
-# http://www.sgi.com
-#
-# For further information regarding this notice, see:
-#
-# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
-#
-
-EXTRA_CFLAGS +=	 -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
-
-ifeq ($(CONFIG_XFS_DEBUG),y)
-	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
-	EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
-endif
-ifeq ($(CONFIG_XFS_TRACE),y)
-	EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
-	EXTRA_CFLAGS += -DXFS_ATTR_TRACE
-	EXTRA_CFLAGS += -DXFS_BLI_TRACE
-	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
-	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
-	EXTRA_CFLAGS += -DXFS_DIR_TRACE
-	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
-	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
-	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
-	EXTRA_CFLAGS += -DXFS_LOG_TRACE
-	EXTRA_CFLAGS += -DXFS_RW_TRACE
-	EXTRA_CFLAGS += -DPAGEBUF_TRACE
-	EXTRA_CFLAGS += -DXFS_VNODE_TRACE
-endif
-
-obj-$(CONFIG_XFS_FS)		+= xfs.o
-
-xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
-				   xfs_dquot.o \
-				   xfs_dquot_item.o \
-				   xfs_trans_dquot.o \
-				   xfs_qm_syscalls.o \
-				   xfs_qm_bhv.o \
-				   xfs_qm.o)
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
-endif
-
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
-xfs-$(CONFIG_PROC_FS)		+= linux-2.6/xfs_stats.o
-xfs-$(CONFIG_SYSCTL)		+= linux-2.6/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT)		+= linux-2.6/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)	+= linux-2.6/xfs_export.o
-
-
-xfs-y				+= xfs_alloc.o \
-				   xfs_alloc_btree.o \
-				   xfs_attr.o \
-				   xfs_attr_leaf.o \
-				   xfs_behavior.o \
-				   xfs_bit.o \
-				   xfs_bmap.o \
-				   xfs_bmap_btree.o \
-				   xfs_btree.o \
-				   xfs_buf_item.o \
-				   xfs_da_btree.o \
-				   xfs_dir.o \
-				   xfs_dir2.o \
-				   xfs_dir2_block.o \
-				   xfs_dir2_data.o \
-				   xfs_dir2_leaf.o \
-				   xfs_dir2_node.o \
-				   xfs_dir2_sf.o \
-				   xfs_dir_leaf.o \
-				   xfs_error.o \
-				   xfs_extfree_item.o \
-				   xfs_fsops.o \
-				   xfs_ialloc.o \
-				   xfs_ialloc_btree.o \
-				   xfs_iget.o \
-				   xfs_inode.o \
-				   xfs_inode_item.o \
-				   xfs_iocore.o \
-				   xfs_iomap.o \
-				   xfs_itable.o \
-				   xfs_dfrag.o \
-				   xfs_log.o \
-				   xfs_log_recover.o \
-				   xfs_macros.o \
-				   xfs_mount.o \
-				   xfs_rename.o \
-				   xfs_trans.o \
-				   xfs_trans_ail.o \
-				   xfs_trans_buf.o \
-				   xfs_trans_extfree.o \
-				   xfs_trans_inode.o \
-				   xfs_trans_item.o \
-				   xfs_utils.o \
-				   xfs_vfsops.o \
-				   xfs_vnodeops.o \
-				   xfs_rw.o \
-				   xfs_dmops.o \
-				   xfs_qmops.o
-
-xfs-$(CONFIG_XFS_TRACE)		+= xfs_dir2_trace.o
-
-# Objects in linux-2.6/
-xfs-y				+= $(addprefix linux-2.6/, \
-				   kmem.o \
-				   xfs_aops.o \
-				   xfs_buf.o \
-				   xfs_file.o \
-				   xfs_fs_subr.o \
-				   xfs_globals.o \
-				   xfs_ioctl.o \
-				   xfs_iops.o \
-				   xfs_lrw.o \
-				   xfs_super.o \
-				   xfs_vfs.o \
-				   xfs_vnode.o)
-
-# Objects in support/
-xfs-y				+= $(addprefix support/, \
-				   debug.o \
-				   move.o \
-				   qsort.o \
-				   uuid.o)
-
-xfs-$(CONFIG_XFS_TRACE)		+= support/ktrace.o
-
+include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL)
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
new file mode 100644
index 00000000000..d8c87fa21ad
--- /dev/null
+++ b/fs/xfs/Makefile-linux-2.6
@@ -0,0 +1,152 @@
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+
+EXTRA_CFLAGS +=	 -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
+
+XFS_LINUX := linux-2.6
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
+	EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
+endif
+ifeq ($(CONFIG_XFS_TRACE),y)
+	EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
+	EXTRA_CFLAGS += -DXFS_ATTR_TRACE
+	EXTRA_CFLAGS += -DXFS_BLI_TRACE
+	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
+	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
+	EXTRA_CFLAGS += -DXFS_DIR_TRACE
+	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
+	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
+	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
+	EXTRA_CFLAGS += -DXFS_LOG_TRACE
+	EXTRA_CFLAGS += -DXFS_RW_TRACE
+	EXTRA_CFLAGS += -DPAGEBUF_TRACE
+	EXTRA_CFLAGS += -DXFS_VNODE_TRACE
+endif
+
+obj-$(CONFIG_XFS_FS)		+= xfs.o
+
+xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
+				   xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm_bhv.o \
+				   xfs_qm.o)
+
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
+endif
+
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
+xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
+xfs-$(CONFIG_PROC_FS)		+= $(XFS_LINUX)/xfs_stats.o
+xfs-$(CONFIG_SYSCTL)		+= $(XFS_LINUX)/xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)		+= $(XFS_LINUX)/xfs_ioctl32.o
+xfs-$(CONFIG_XFS_EXPORT)	+= $(XFS_LINUX)/xfs_export.o
+
+
+xfs-y				+= xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_leaf.o \
+				   xfs_behavior.o \
+				   xfs_bit.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_buf_item.o \
+				   xfs_da_btree.o \
+				   xfs_dir.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dir_leaf.o \
+				   xfs_error.o \
+				   xfs_extfree_item.o \
+				   xfs_fsops.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_iget.o \
+				   xfs_inode.o \
+				   xfs_inode_item.o \
+				   xfs_iocore.o \
+				   xfs_iomap.o \
+				   xfs_itable.o \
+				   xfs_dfrag.o \
+				   xfs_log.o \
+				   xfs_log_recover.o \
+				   xfs_macros.o \
+				   xfs_mount.o \
+				   xfs_rename.o \
+				   xfs_trans.o \
+				   xfs_trans_ail.o \
+				   xfs_trans_buf.o \
+				   xfs_trans_extfree.o \
+				   xfs_trans_inode.o \
+				   xfs_trans_item.o \
+				   xfs_utils.o \
+				   xfs_vfsops.o \
+				   xfs_vnodeops.o \
+				   xfs_rw.o \
+				   xfs_dmops.o \
+				   xfs_qmops.o
+
+xfs-$(CONFIG_XFS_TRACE)		+= xfs_dir2_trace.o
+
+# Objects in linux/
+xfs-y				+= $(addprefix $(XFS_LINUX)/, \
+				   kmem.o \
+				   xfs_aops.o \
+				   xfs_buf.o \
+				   xfs_file.o \
+				   xfs_fs_subr.o \
+				   xfs_globals.o \
+				   xfs_ioctl.o \
+				   xfs_iops.o \
+				   xfs_lrw.o \
+				   xfs_super.o \
+				   xfs_vfs.o \
+				   xfs_vnode.o)
+
+# Objects in support/
+xfs-y				+= $(addprefix support/, \
+				   debug.o \
+				   move.o \
+				   uuid.o)
+
+xfs-$(CONFIG_XFS_TRACE)		+= support/ktrace.o
+
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 364ea8c386b..4b184559f23 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,11 +45,11 @@
 
 
 void *
-kmem_alloc(size_t size, int flags)
+kmem_alloc(size_t size, unsigned int __nocast flags)
 {
-	int	retries = 0;
-	int	lflags = kmem_flags_convert(flags);
-	void	*ptr;
+	int		retries = 0;
+	unsigned int	lflags = kmem_flags_convert(flags);
+	void		*ptr;
 
 	do {
 		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, int flags)
 }
 
 void *
-kmem_zalloc(size_t size, int flags)
+kmem_zalloc(size_t size, unsigned int __nocast flags)
 {
 	void	*ptr;
 
@@ -89,7 +89,8 @@ kmem_free(void *ptr, size_t size)
 }
 
 void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
+	     unsigned int __nocast flags)
 {
 	void	*new;
 
@@ -104,11 +105,11 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
 }
 
 void *
-kmem_zone_alloc(kmem_zone_t *zone, int flags)
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
-	int	retries = 0;
-	int	lflags = kmem_flags_convert(flags);
-	void	*ptr;
+	int		retries = 0;
+	unsigned int	lflags = kmem_flags_convert(flags);
+	void		*ptr;
 
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
@@ -123,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, int flags)
 }
 
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
 	void	*ptr;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 1397b669b05..109fcf27e25 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -39,10 +39,10 @@
 /*
  * memory management routines
  */
-#define KM_SLEEP	0x0001
-#define KM_NOSLEEP	0x0002
-#define KM_NOFS		0x0004
-#define KM_MAYFAIL	0x0008
+#define KM_SLEEP	0x0001u
+#define KM_NOSLEEP	0x0002u
+#define KM_NOFS		0x0004u
+#define KM_MAYFAIL	0x0008u
 
 #define	kmem_zone	kmem_cache_s
 #define kmem_zone_t	kmem_cache_t
@@ -81,9 +81,9 @@ typedef unsigned long xfs_pflags_t;
 	*(NSTATEP) = *(OSTATEP);	\
 } while (0)
 
-static __inline unsigned int kmem_flags_convert(int flags)
+static __inline unsigned int kmem_flags_convert(unsigned int __nocast flags)
 {
-	int	lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
+	unsigned int	lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
 
 #ifdef DEBUG
 	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
@@ -125,12 +125,13 @@ kmem_zone_destroy(kmem_zone_t *zone)
 		BUG();
 }
 
-extern void	    *kmem_zone_zalloc(kmem_zone_t *, int);
-extern void	    *kmem_zone_alloc(kmem_zone_t *, int);
+extern void	    *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void	    *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
 
-extern void	    *kmem_alloc(size_t, int);
-extern void	    *kmem_realloc(void *, size_t, size_t, int);
-extern void	    *kmem_zalloc(size_t, int);
+extern void	    *kmem_alloc(size_t, unsigned int __nocast);
+extern void	    *kmem_realloc(void *, size_t, size_t,
+				  unsigned int __nocast);
+extern void	    *kmem_zalloc(size_t, unsigned int __nocast);
 extern void         kmem_free(void *, size_t);
 
 typedef struct shrinker *kmem_shaker_t;
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
index bcf60a0b8df..0039504069a 100644
--- a/fs/xfs/linux-2.6/spin.h
+++ b/fs/xfs/linux-2.6/spin.h
@@ -45,6 +45,9 @@
 typedef spinlock_t lock_t;
 
 #define SPLDECL(s)			unsigned long s
+#ifndef DEFINE_SPINLOCK
+#define DEFINE_SPINLOCK(s)		spinlock_t s = SPIN_LOCK_UNLOCKED
+#endif
 
 #define spinlock_init(lock, name)	spin_lock_init(lock)
 #define	spinlock_destroy(lock)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a3a4b5aaf5d..c6c077978fe 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -104,66 +104,114 @@ xfs_page_trace(
 #define xfs_page_trace(tag, inode, page, mask)
 #endif
 
-void
-linvfs_unwritten_done(
-	struct buffer_head	*bh,
-	int			uptodate)
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t		*ioend)
 {
-	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;
+	if (atomic_dec_and_test(&ioend->io_remaining))
+		queue_work(xfsdatad_workqueue, &ioend->io_work);
+}
 
-	ASSERT(buffer_unwritten(bh));
-	bh->b_end_io = NULL;
-	clear_buffer_unwritten(bh);
-	if (!uptodate)
-		pagebuf_ioerror(pb, EIO);
-	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-		pagebuf_iodone(pb, 1, 1);
-	}
-	end_buffer_async_write(bh, uptodate);
+STATIC void
+xfs_destroy_ioend(
+	xfs_ioend_t		*ioend)
+{
+	vn_iowake(ioend->io_vnode);
+	mempool_free(ioend, xfs_ioend_pool);
 }
 
 /*
  * Issue transactions to convert a buffer range from unwritten
- * to written extents (buffered IO).
+ * to written extents.
  */
 STATIC void
-linvfs_unwritten_convert(
-	xfs_buf_t	*bp)
+xfs_end_bio_unwritten(
+	void			*data)
 {
-	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
-	int		error;
+	xfs_ioend_t		*ioend = data;
+	vnode_t			*vp = ioend->io_vnode;
+	xfs_off_t		offset = ioend->io_offset;
+	size_t			size = ioend->io_size;
+	struct buffer_head	*bh, *next;
+	int			error;
+
+	if (ioend->io_uptodate)
+		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+
+	/* ioend->io_buffer_head is only non-NULL for buffered I/O */
+	for (bh = ioend->io_buffer_head; bh; bh = next) {
+		next = bh->b_private;
+
+		bh->b_end_io = NULL;
+		clear_buffer_unwritten(bh);
+		end_buffer_async_write(bh, ioend->io_uptodate);
+	}
 
-	BUG_ON(atomic_read(&bp->pb_hold) < 1);
-	VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
-			BMAPI_UNWRITTEN, NULL, NULL, error);
-	XFS_BUF_SET_FSPRIVATE(bp, NULL);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_UNDATAIO(bp);
-	iput(LINVFS_GET_IP(vp));
-	pagebuf_iodone(bp, 0, 0);
+	xfs_destroy_ioend(ioend);
 }
 
 /*
- * Issue transactions to convert a buffer range from unwritten
- * to written extents (direct IO).
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
  */
-STATIC void
-linvfs_unwritten_convert_direct(
-	struct kiocb	*iocb,
-	loff_t		offset,
-	ssize_t		size,
-	void		*private)
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+	struct inode		*inode)
 {
-	struct inode	*inode = iocb->ki_filp->f_dentry->d_inode;
-	ASSERT(!private || inode == (struct inode *)private);
+	xfs_ioend_t		*ioend;
 
-	/* private indicates an unwritten extent lay beneath this IO */
-	if (private && size > 0) {
-		vnode_t	*vp = LINVFS_GET_VP(inode);
-		int	error;
+	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 
-		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
-	}
+	/*
+	 * Set the count to 1 initially, which will prevent an I/O
+	 * completion callback from happening before we have started
+	 * all the I/O from calling the completion routine too early.
+	 */
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_uptodate = 1; /* cleared if any I/O fails */
+	ioend->io_vnode = LINVFS_GET_VP(inode);
+	ioend->io_buffer_head = NULL;
+	atomic_inc(&ioend->io_vnode->v_iocount);
+	ioend->io_offset = 0;
+	ioend->io_size = 0;
+
+	INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
+
+	return ioend;
+}
+
+void
+linvfs_unwritten_done(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	xfs_ioend_t		*ioend = bh->b_private;
+	static spinlock_t	unwritten_done_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+
+	ASSERT(buffer_unwritten(bh));
+	bh->b_end_io = NULL;
+
+	if (!uptodate)
+		ioend->io_uptodate = 0;
+
+	/*
+	 * Deep magic here.  We reuse b_private in the buffer_heads to build
+	 * a chain for completing the I/O from user context after we've issued
+	 * a transaction to convert the unwritten extent.
+	 */
+	spin_lock_irqsave(&unwritten_done_lock, flags);
+	bh->b_private = ioend->io_buffer_head;
+	ioend->io_buffer_head = bh;
+	spin_unlock_irqrestore(&unwritten_done_lock, flags);
+
+	xfs_finish_ioend(ioend);
 }
 
 STATIC int
@@ -255,7 +303,7 @@ xfs_probe_unwritten_page(
 	struct address_space	*mapping,
 	pgoff_t			index,
 	xfs_iomap_t		*iomapp,
-	xfs_buf_t		*pb,
+	xfs_ioend_t		*ioend,
 	unsigned long		max_offset,
 	unsigned long		*fsbs,
 	unsigned int            bbits)
@@ -283,7 +331,7 @@ xfs_probe_unwritten_page(
 				break;
 			xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
 			set_buffer_unwritten_io(bh);
-			bh->b_private = pb;
+			bh->b_private = ioend;
 			p_offset += bh->b_size;
 			(*fsbs)++;
 		} while ((bh = bh->b_this_page) != head);
@@ -434,34 +482,15 @@ xfs_map_unwritten(
 {
 	struct buffer_head	*bh = curr;
 	xfs_iomap_t		*tmp;
-	xfs_buf_t		*pb;
-	loff_t			offset, size;
+	xfs_ioend_t		*ioend;
+	loff_t			offset;
 	unsigned long		nblocks = 0;
 
 	offset = start_page->index;
 	offset <<= PAGE_CACHE_SHIFT;
 	offset += p_offset;
 
-	/* get an "empty" pagebuf to manage IO completion
-	 * Proper values will be set before returning */
-	pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
-	if (!pb)
-		return -EAGAIN;
-
-	/* Take a reference to the inode to prevent it from
-	 * being reclaimed while we have outstanding unwritten
-	 * extent IO on it.
-	 */
-	if ((igrab(inode)) != inode) {
-		pagebuf_free(pb);
-		return -EAGAIN;
-	}
-
-	/* Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling pagebuf_iodone too early.
-	 */
-	atomic_set(&pb->pb_io_remaining, 1);
+	ioend = xfs_alloc_ioend(inode);
 
 	/* First map forwards in the page consecutive buffers
 	 * covering this unwritten extent
@@ -474,12 +503,12 @@ xfs_map_unwritten(
 			break;
 		xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
 		set_buffer_unwritten_io(bh);
-		bh->b_private = pb;
+		bh->b_private = ioend;
 		p_offset += bh->b_size;
 		nblocks++;
 	} while ((bh = bh->b_this_page) != head);
 
-	atomic_add(nblocks, &pb->pb_io_remaining);
+	atomic_add(nblocks, &ioend->io_remaining);
 
 	/* If we reached the end of the page, map forwards in any
 	 * following pages which are also covered by this extent.
@@ -496,13 +525,13 @@ xfs_map_unwritten(
 		tloff = min(tlast, tloff);
 		for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
 			page = xfs_probe_unwritten_page(mapping,
-						tindex, iomapp, pb,
+						tindex, iomapp, ioend,
 						PAGE_CACHE_SIZE, &bs, bbits);
 			if (!page)
 				break;
 			nblocks += bs;
-			atomic_add(bs, &pb->pb_io_remaining);
-			xfs_convert_page(inode, page, iomapp, wbc, pb,
+			atomic_add(bs, &ioend->io_remaining);
+			xfs_convert_page(inode, page, iomapp, wbc, ioend,
 							startio, all_bh);
 			/* stop if converting the next page might add
 			 * enough blocks that the corresponding byte
@@ -514,12 +543,12 @@ xfs_map_unwritten(
 		if (tindex == tlast &&
 		    (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
 			page = xfs_probe_unwritten_page(mapping,
-							tindex, iomapp, pb,
+							tindex, iomapp, ioend,
 							pg_offset, &bs, bbits);
 			if (page) {
 				nblocks += bs;
-				atomic_add(bs, &pb->pb_io_remaining);
-				xfs_convert_page(inode, page, iomapp, wbc, pb,
+				atomic_add(bs, &ioend->io_remaining);
+				xfs_convert_page(inode, page, iomapp, wbc, ioend,
 							startio, all_bh);
 				if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
 					goto enough;
@@ -528,21 +557,9 @@ xfs_map_unwritten(
 	}
 
 enough:
-	size = nblocks;		/* NB: using 64bit number here */
-	size <<= block_bits;	/* convert fsb's to byte range */
-
-	XFS_BUF_DATAIO(pb);
-	XFS_BUF_ASYNC(pb);
-	XFS_BUF_SET_SIZE(pb, size);
-	XFS_BUF_SET_COUNT(pb, size);
-	XFS_BUF_SET_OFFSET(pb, offset);
-	XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
-	XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
-
-	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-		pagebuf_iodone(pb, 1, 1);
-	}
-
+	ioend->io_size = (xfs_off_t)nblocks << block_bits;
+	ioend->io_offset = offset;
+	xfs_finish_ioend(ioend);
 	return 0;
 }
 
@@ -787,7 +804,7 @@ xfs_page_state_convert(
 				continue;
 			if (!iomp) {
 				err = xfs_map_blocks(inode, offset, len, &iomap,
-						BMAPI_READ|BMAPI_IGNSTATE);
+						BMAPI_WRITE|BMAPI_IGNSTATE);
 				if (err) {
 					goto error;
 				}
@@ -1028,6 +1045,44 @@ linvfs_get_blocks_direct(
 					create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
 
+STATIC void
+linvfs_end_io_direct(
+	struct kiocb	*iocb,
+	loff_t		offset,
+	ssize_t		size,
+	void		*private)
+{
+	xfs_ioend_t	*ioend = iocb->private;
+
+	/*
+	 * Non-NULL private data means we need to issue a transaction to
+	 * convert a range from unwritten to written extents.  This needs
+	 * to happen from process contect but aio+dio I/O completion
+	 * happens from irq context so we need to defer it to a workqueue.
+	 * This is not nessecary for synchronous direct I/O, but we do
+	 * it anyway to keep the code uniform and simpler.
+	 *
+	 * The core direct I/O code might be changed to always call the
+	 * completion handler in the future, in which case all this can
+	 * go away.
+	 */
+	if (private && size > 0) {
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+		xfs_finish_ioend(ioend);
+	} else {
+		ASSERT(size >= 0);
+		xfs_destroy_ioend(ioend);
+	}
+
+	/*
+	 * blockdev_direct_IO can return an error even afer the I/O
+	 * completion handler was called.  Thus we need to protect
+	 * against double-freeing.
+	 */
+	iocb->private = NULL;
+}
+
 STATIC ssize_t
 linvfs_direct_IO(
 	int			rw,
@@ -1042,16 +1097,23 @@ linvfs_direct_IO(
 	xfs_iomap_t	iomap;
 	int		maps = 1;
 	int		error;
+	ssize_t		ret;
 
 	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
 	if (error)
 		return -error;
 
-	return blockdev_direct_IO_own_locking(rw, iocb, inode,
+	iocb->private = xfs_alloc_ioend(inode);
+
+	ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
 		iomap.iomap_target->pbr_bdev,
 		iov, offset, nr_segs,
 		linvfs_get_blocks_direct,
-		linvfs_unwritten_convert_direct);
+		linvfs_end_io_direct);
+
+	if (unlikely(ret <= 0 && iocb->private))
+		xfs_destroy_ioend(iocb->private);
+	return ret;
 }
 
 
@@ -1202,6 +1264,16 @@ out_unlock:
 	return error;
 }
 
+STATIC int
+linvfs_invalidate_page(
+	struct page		*page,
+	unsigned long		offset)
+{
+	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
+			page->mapping->host, page, offset);
+	return block_invalidatepage(page, offset);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. Possibly the page is already clean. We always
@@ -1279,6 +1351,7 @@ struct address_space_operations linvfs_aops = {
 	.writepage		= linvfs_writepage,
 	.sync_page		= block_sync_page,
 	.releasepage		= linvfs_release_page,
+	.invalidatepage		= linvfs_invalidate_page,
 	.prepare_write		= linvfs_prepare_write,
 	.commit_write		= generic_commit_write,
 	.bmap			= linvfs_bmap,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
new file mode 100644
index 00000000000..2fa62974a04
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+
+extern struct workqueue_struct *xfsdatad_workqueue;
+extern mempool_t *xfs_ioend_pool;
+
+typedef void (*xfs_ioend_func_t)(void *);
+
+typedef struct xfs_ioend {
+	unsigned int		io_uptodate;	/* I/O status register */
+	atomic_t		io_remaining;	/* hold count */
+	struct vnode		*io_vnode;	/* file being written to */
+	struct buffer_head	*io_buffer_head;/* buffer linked list head */
+	size_t			io_size;	/* size of the extent */
+	xfs_off_t		io_offset;	/* offset in the file */
+	struct work_struct	io_work;	/* xfsdatad work queue */
+} xfs_ioend_t;
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index df0cba239dd..655bf4a78af 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -54,6 +54,7 @@
 #include <linux/percpu.h>
 #include <linux/blkdev.h>
 #include <linux/hash.h>
+#include <linux/kthread.h>
 
 #include "xfs_linux.h"
 
@@ -67,7 +68,7 @@ STATIC int xfsbufd_wakeup(int, unsigned int);
 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
 
 STATIC struct workqueue_struct *xfslogd_workqueue;
-STATIC struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsdatad_workqueue;
 
 /*
  * Pagebuf debugging
@@ -590,8 +591,10 @@ found:
 		PB_SET_OWNER(pb);
 	}
 
-	if (pb->pb_flags & PBF_STALE)
+	if (pb->pb_flags & PBF_STALE) {
+		ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
 		pb->pb_flags &= PBF_MAPPED;
+	}
 	PB_TRACE(pb, "got_lock", 0);
 	XFS_STATS_INC(pb_get_locked);
 	return (pb);
@@ -700,25 +703,6 @@ xfs_buf_read_flags(
 }
 
 /*
- * Create a skeletal pagebuf (no pages associated with it).
- */
-xfs_buf_t *
-pagebuf_lookup(
-	xfs_buftarg_t		*target,
-	loff_t			ioff,
-	size_t			isize,
-	page_buf_flags_t	flags)
-{
-	xfs_buf_t		*pb;
-
-	pb = pagebuf_allocate(flags);
-	if (pb) {
-		_pagebuf_initialize(pb, target, ioff, isize, flags);
-	}
-	return pb;
-}
-
-/*
  * If we are not low on memory then do the readahead in a deadlock
  * safe manner.
  */
@@ -913,22 +897,23 @@ pagebuf_rele(
 			do_free = 0;
 		}
 
-		if (pb->pb_flags & PBF_DELWRI) {
-			pb->pb_flags |= PBF_ASYNC;
-			atomic_inc(&pb->pb_hold);
-			pagebuf_delwri_queue(pb, 0);
-			do_free = 0;
-		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+		if (pb->pb_flags & PBF_FS_MANAGED) {
 			do_free = 0;
 		}
 
 		if (do_free) {
+			ASSERT((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == 0);
 			list_del_init(&pb->pb_hash_list);
 			spin_unlock(&hash->bh_lock);
 			pagebuf_free(pb);
 		} else {
 			spin_unlock(&hash->bh_lock);
 		}
+	} else {
+		/*
+		 * Catch reference count leaks
+		 */
+		ASSERT(atomic_read(&pb->pb_hold) >= 0);
 	}
 }
 
@@ -1006,13 +991,24 @@ pagebuf_lock(
  *	pagebuf_unlock
  *
  *	pagebuf_unlock releases the lock on the buffer object created by
- *	pagebuf_lock or pagebuf_cond_lock (not any
- *	pinning of underlying pages created by pagebuf_pin).
+ *	pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
+ *	created by pagebuf_pin).
+ *
+ *	If the buffer is marked delwri but is not queued, do so before we
+ *	unlock the buffer as we need to set flags correctly. We also need to
+ *	take a reference for the delwri queue because the unlocker is going to
+ *	drop their's and they don't know we just queued it.
  */
 void
 pagebuf_unlock(				/* unlock buffer		*/
 	xfs_buf_t		*pb)	/* buffer to unlock		*/
 {
+	if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) {
+		atomic_inc(&pb->pb_hold);
+		pb->pb_flags |= PBF_ASYNC;
+		pagebuf_delwri_queue(pb, 0);
+	}
+
 	PB_CLEAR_OWNER(pb);
 	up(&pb->pb_sema);
 	PB_TRACE(pb, "unlock", 0);
@@ -1249,8 +1245,8 @@ bio_end_io_pagebuf(
 	int			error)
 {
 	xfs_buf_t		*pb = (xfs_buf_t *)bio->bi_private;
-	unsigned int		i, blocksize = pb->pb_target->pbr_bsize;
-	struct bio_vec		*bvec = bio->bi_io_vec;
+	unsigned int		blocksize = pb->pb_target->pbr_bsize;
+	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
 	if (bio->bi_size)
 		return 1;
@@ -1258,10 +1254,12 @@ bio_end_io_pagebuf(
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		pb->pb_error = EIO;
 
-	for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
+	do {
 		struct page	*page = bvec->bv_page;
 
-		if (pb->pb_error) {
+		if (unlikely(pb->pb_error)) {
+			if (pb->pb_flags & PBF_READ)
+				ClearPageUptodate(page);
 			SetPageError(page);
 		} else if (blocksize == PAGE_CACHE_SIZE) {
 			SetPageUptodate(page);
@@ -1270,10 +1268,13 @@ bio_end_io_pagebuf(
 			set_page_region(page, bvec->bv_offset, bvec->bv_len);
 		}
 
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
 		if (_pagebuf_iolocked(pb)) {
 			unlock_page(page);
 		}
-	}
+	} while (bvec >= bio->bi_io_vec);
 
 	_pagebuf_iodone(pb, 1);
 	bio_put(bio);
@@ -1511,6 +1512,11 @@ again:
 			ASSERT(btp == bp->pb_target);
 			if (!(bp->pb_flags & PBF_FS_MANAGED)) {
 				spin_unlock(&hash->bh_lock);
+				/*
+				 * Catch superblock reference count leaks
+				 * immediately
+				 */
+				BUG_ON(bp->pb_bn == 0);
 				delay(100);
 				goto again;
 			}
@@ -1686,17 +1692,20 @@ pagebuf_delwri_queue(
 	int			unlock)
 {
 	PB_TRACE(pb, "delwri_q", (long)unlock);
-	ASSERT(pb->pb_flags & PBF_DELWRI);
+	ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) ==
+					(PBF_DELWRI|PBF_ASYNC));
 
 	spin_lock(&pbd_delwrite_lock);
 	/* If already in the queue, dequeue and place at tail */
 	if (!list_empty(&pb->pb_list)) {
+		ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
 		if (unlock) {
 			atomic_dec(&pb->pb_hold);
 		}
 		list_del(&pb->pb_list);
 	}
 
+	pb->pb_flags |= _PBF_DELWRI_Q;
 	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
 	pb->pb_queuetime = jiffies;
 	spin_unlock(&pbd_delwrite_lock);
@@ -1713,10 +1722,11 @@ pagebuf_delwri_dequeue(
 
 	spin_lock(&pbd_delwrite_lock);
 	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
+		ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
 		list_del_init(&pb->pb_list);
 		dequeued = 1;
 	}
-	pb->pb_flags &= ~PBF_DELWRI;
+	pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
 	spin_unlock(&pbd_delwrite_lock);
 
 	if (dequeued)
@@ -1733,9 +1743,7 @@ pagebuf_runall_queues(
 }
 
 /* Defines for pagebuf daemon */
-STATIC DECLARE_COMPLETION(xfsbufd_done);
 STATIC struct task_struct *xfsbufd_task;
-STATIC int xfsbufd_active;
 STATIC int xfsbufd_force_flush;
 STATIC int xfsbufd_force_sleep;
 
@@ -1761,14 +1769,8 @@ xfsbufd(
 	xfs_buftarg_t		*target;
 	xfs_buf_t		*pb, *n;
 
-	/*  Set up the thread  */
-	daemonize("xfsbufd");
 	current->flags |= PF_MEMALLOC;
 
-	xfsbufd_task = current;
-	xfsbufd_active = 1;
-	barrier();
-
 	INIT_LIST_HEAD(&tmp);
 	do {
 		if (unlikely(freezing(current))) {
@@ -1795,7 +1797,7 @@ xfsbufd(
 					break;
 				}
 
-				pb->pb_flags &= ~PBF_DELWRI;
+				pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
 			}
@@ -1816,9 +1818,9 @@ xfsbufd(
 			purge_addresses();
 
 		xfsbufd_force_flush = 0;
-	} while (xfsbufd_active);
+	} while (!kthread_should_stop());
 
-	complete_and_exit(&xfsbufd_done, 0);
+	return 0;
 }
 
 /*
@@ -1845,15 +1847,13 @@ xfs_flush_buftarg(
 		if (pb->pb_target != target)
 			continue;
 
-		ASSERT(pb->pb_flags & PBF_DELWRI);
+		ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q));
 		PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
 		if (pagebuf_ispin(pb)) {
 			pincount++;
 			continue;
 		}
 
-		pb->pb_flags &= ~PBF_DELWRI;
-		pb->pb_flags |= PBF_WRITE;
 		list_move(&pb->pb_list, &tmp);
 	}
 	spin_unlock(&pbd_delwrite_lock);
@@ -1862,12 +1862,14 @@ xfs_flush_buftarg(
 	 * Dropped the delayed write list lock, now walk the temporary list
 	 */
 	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
+		pagebuf_lock(pb);
+		pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
+		pb->pb_flags |= PBF_WRITE;
 		if (wait)
 			pb->pb_flags &= ~PBF_ASYNC;
 		else
 			list_del_init(&pb->pb_list);
 
-		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
 	}
 
@@ -1901,9 +1903,11 @@ xfs_buf_daemons_start(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	error = kernel_thread(xfsbufd, NULL, CLONE_FS|CLONE_FILES);
-	if (error < 0)
+	xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd");
+	if (IS_ERR(xfsbufd_task)) {
+		error = PTR_ERR(xfsbufd_task);
 		goto out_destroy_xfsdatad_workqueue;
+	}
 	return 0;
 
  out_destroy_xfsdatad_workqueue:
@@ -1920,10 +1924,7 @@ xfs_buf_daemons_start(void)
 STATIC void
 xfs_buf_daemons_stop(void)
 {
-	xfsbufd_active = 0;
-	barrier();
-	wait_for_completion(&xfsbufd_done);
-
+	kthread_stop(xfsbufd_task);
 	destroy_workqueue(xfslogd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 3f8f69a66ae..67c19f79923 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -89,6 +89,7 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
 	_PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache		   */
 	_PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc()		   */
 	_PBF_RUN_QUEUES = (1 << 19),/* run block device task queue	   */
+	_PBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue		   */
 } page_buf_flags_t;
 
 #define PBF_UPDATE (PBF_READ | PBF_WRITE)
@@ -206,13 +207,6 @@ extern xfs_buf_t *xfs_buf_read_flags(	/* allocate and read a buffer	*/
 #define xfs_buf_read(target, blkno, len, flags) \
 	xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
 
-extern xfs_buf_t *pagebuf_lookup(
-		xfs_buftarg_t *,
-		loff_t,			/* starting offset of range	*/
-		size_t,			/* length of range		*/
-		page_buf_flags_t);	/* PBF_READ, PBF_WRITE,		*/
-					/* PBF_FORCEIO, 		*/
-
 extern xfs_buf_t *pagebuf_get_empty(	/* allocate pagebuf struct with	*/
 					/*  no memory or disk address	*/
 		size_t len,
@@ -344,8 +338,6 @@ extern void pagebuf_trace(
 
 
 
-
-
 /* These are just for xfs_syncsub... it sets an internal variable
  * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
  */
@@ -452,7 +444,7 @@ extern void pagebuf_trace(
 
 #define XFS_BUF_PTR(bp)		(xfs_caddr_t)((bp)->pb_addr)
 
-extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
+static inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
 {
 	if (bp->pb_flags & PBF_MAPPED)
 		return XFS_BUF_PTR(bp) + offset;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f1ce4323f56..3881622bcf0 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -311,6 +311,31 @@ linvfs_fsync(
 
 #define nextdp(dp)      ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
 
+#ifdef CONFIG_XFS_DMAPI
+
+STATIC struct page *
+linvfs_filemap_nopage(
+	struct vm_area_struct	*area,
+	unsigned long		address,
+	int			*type)
+{
+	struct inode	*inode = area->vm_file->f_dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
+	int		error;
+
+	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
+
+	error = XFS_SEND_MMAP(mp, area, 0);
+	if (error)
+		return NULL;
+
+	return filemap_nopage(area, address, type);
+}
+
+#endif /* CONFIG_XFS_DMAPI */
+
+
 STATIC int
 linvfs_readdir(
 	struct file	*filp,
@@ -390,14 +415,6 @@ done:
 	return -error;
 }
 
-#ifdef CONFIG_XFS_DMAPI
-STATIC void
-linvfs_mmap_close(
-	struct vm_area_struct	*vma)
-{
-	xfs_dm_mm_put(vma);
-}
-#endif /* CONFIG_XFS_DMAPI */
 
 STATIC int
 linvfs_file_mmap(
@@ -411,16 +428,11 @@ linvfs_file_mmap(
 
 	vma->vm_ops = &linvfs_file_vm_ops;
 
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-
-		error = -XFS_SEND_MMAP(mp, vma, 0);
-		if (error)
-			return error;
 #ifdef CONFIG_XFS_DMAPI
+	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
 		vma->vm_ops = &linvfs_dmapi_file_vm_ops;
-#endif
 	}
+#endif /* CONFIG_XFS_DMAPI */
 
 	VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
 	if (!error)
@@ -474,6 +486,7 @@ linvfs_ioctl_invis(
 	return error;
 }
 
+#ifdef CONFIG_XFS_DMAPI
 #ifdef HAVE_VMOP_MPROTECT
 STATIC int
 linvfs_mprotect(
@@ -494,6 +507,7 @@ linvfs_mprotect(
 	return error;
 }
 #endif /* HAVE_VMOP_MPROTECT */
+#endif /* CONFIG_XFS_DMAPI */
 
 #ifdef HAVE_FOP_OPEN_EXEC
 /* If the user is attempting to execute a file that is offline then
@@ -528,49 +542,10 @@ open_exec_out:
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
 
-/*
- * Temporary workaround to the AIO direct IO write problem.
- * This code can go and we can revert to do_sync_write once
- * the writepage(s) rework is merged.
- */
-STATIC ssize_t
-linvfs_write(
-	struct file	*filp,
-	const char	__user *buf,
-	size_t		len,
-	loff_t		*ppos)
-{
-	struct kiocb	kiocb;
-	ssize_t		ret;
-
-	init_sync_kiocb(&kiocb, filp);
-	kiocb.ki_pos = *ppos;
-	ret = __linvfs_write(&kiocb, buf, 0, len, kiocb.ki_pos);
-	*ppos = kiocb.ki_pos;
-	return ret;
-}
-STATIC ssize_t
-linvfs_write_invis(
-	struct file	*filp,
-	const char	__user *buf,
-	size_t		len,
-	loff_t		*ppos)
-{
-	struct kiocb	kiocb;
-	ssize_t		ret;
-
-	init_sync_kiocb(&kiocb, filp);
-	kiocb.ki_pos = *ppos;
-	ret = __linvfs_write(&kiocb, buf, IO_INVIS, len, kiocb.ki_pos);
-	*ppos = kiocb.ki_pos;
-	return ret;
-}
-
-
 struct file_operations linvfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.write		= linvfs_write,
+	.write		= do_sync_write,
 	.readv		= linvfs_readv,
 	.writev		= linvfs_writev,
 	.aio_read	= linvfs_aio_read,
@@ -592,7 +567,7 @@ struct file_operations linvfs_file_operations = {
 struct file_operations linvfs_invis_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.write		= linvfs_write_invis,
+	.write		= do_sync_write,
 	.readv		= linvfs_readv_invis,
 	.writev		= linvfs_writev_invis,
 	.aio_read	= linvfs_aio_read_invis,
@@ -626,8 +601,7 @@ static struct vm_operations_struct linvfs_file_vm_ops = {
 
 #ifdef CONFIG_XFS_DMAPI
 static struct vm_operations_struct linvfs_dmapi_file_vm_ops = {
-	.close		= linvfs_mmap_close,
-	.nopage		= filemap_nopage,
+	.nopage		= linvfs_filemap_nopage,
 	.populate	= filemap_populate,
 #ifdef HAVE_VMOP_MPROTECT
 	.mprotect	= linvfs_mprotect,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 05a447e51cc..6a3326bcd8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -141,13 +141,19 @@ xfs_find_handle(
 		return -XFS_ERROR(EINVAL);
 	}
 
-	/* we need the vnode */
-	vp = LINVFS_GET_VP(inode);
-	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		break;
+	default:
 		iput(inode);
 		return -XFS_ERROR(EBADF);
 	}
 
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+
 	/* now we can grab the fsid */
 	memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
 	hsize = sizeof(xfs_fsid_t);
@@ -386,7 +392,7 @@ xfs_readlink_by_handle(
 		return -error;
 
 	/* Restrict this handle operation to symlinks only. */
-	if (vp->v_type != VLNK) {
+	if (!S_ISLNK(inode->i_mode)) {
 		VN_RELE(vp);
 		return -XFS_ERROR(EINVAL);
 	}
@@ -982,10 +988,10 @@ xfs_ioc_space(
 	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
 		return -XFS_ERROR(EPERM);
 
-	if (!(filp->f_flags & FMODE_WRITE))
+	if (!(filp->f_mode & FMODE_WRITE))
 		return -XFS_ERROR(EBADF);
 
-	if (vp->v_type != VREG)
+	if (!VN_ISREG(vp))
 		return -XFS_ERROR(EINVAL);
 
 	if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0f8f1384eb3..4636b7f86f1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -47,8 +47,52 @@
 #include "xfs_vnode.h"
 #include "xfs_dfrag.h"
 
+#define  _NATIVE_IOC(cmd, type) \
+	  _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
+
 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 #define BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct xfs_flock64_32 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start	__attribute__((packed));
+			/* len == 0 means until end of file */
+	__s64		l_len __attribute__((packed));
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area */
+} xfs_flock64_32_t;
+
+#define XFS_IOC_ALLOCSP_32	_IOW ('X', 10, struct xfs_flock64_32)
+#define XFS_IOC_FREESP_32	_IOW ('X', 11, struct xfs_flock64_32)
+#define XFS_IOC_ALLOCSP64_32	_IOW ('X', 36, struct xfs_flock64_32)
+#define XFS_IOC_FREESP64_32	_IOW ('X', 37, struct xfs_flock64_32)
+#define XFS_IOC_RESVSP_32	_IOW ('X', 40, struct xfs_flock64_32)
+#define XFS_IOC_UNRESVSP_32	_IOW ('X', 41, struct xfs_flock64_32)
+#define XFS_IOC_RESVSP64_32	_IOW ('X', 42, struct xfs_flock64_32)
+#define XFS_IOC_UNRESVSP64_32	_IOW ('X', 43, struct xfs_flock64_32)
+
+/* just account for different alignment */
+STATIC unsigned long
+xfs_ioctl32_flock(
+	unsigned long		arg)
+{
+	xfs_flock64_32_t	__user *p32 = (void __user *)arg;
+	xfs_flock64_t		__user *p = compat_alloc_user_space(sizeof(*p));
+
+	if (copy_in_user(&p->l_type,	&p32->l_type,	sizeof(s16)) ||
+	    copy_in_user(&p->l_whence,	&p32->l_whence, sizeof(s16)) ||
+	    copy_in_user(&p->l_start,	&p32->l_start,	sizeof(s64)) ||
+	    copy_in_user(&p->l_len,	&p32->l_len,	sizeof(s64)) ||
+	    copy_in_user(&p->l_sysid,	&p32->l_sysid,	sizeof(s32)) ||
+	    copy_in_user(&p->l_pid,	&p32->l_pid,	sizeof(u32)) ||
+	    copy_in_user(&p->l_pad,	&p32->l_pad,	4*sizeof(u32)))
+		return -EFAULT;
+	
+	return (unsigned long)p;
+}
+
 #else
 
 typedef struct xfs_fsop_bulkreq32 {
@@ -103,7 +147,6 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
 /* not handled
 	case XFS_IOC_FD_TO_HANDLE:
 	case XFS_IOC_PATH_TO_HANDLE:
-	case XFS_IOC_PATH_TO_HANDLE:
 	case XFS_IOC_PATH_TO_FSHANDLE:
 	case XFS_IOC_OPEN_BY_HANDLE:
 	case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -124,8 +167,21 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
 	case XFS_IOC_ERROR_CLEARALL:
 		break;
 
-#ifndef BROKEN_X86_ALIGNMENT
-	/* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */
+#ifdef BROKEN_X86_ALIGNMENT
+	/* xfs_flock_t has wrong u32 vs u64 alignment */
+	case XFS_IOC_ALLOCSP_32:
+	case XFS_IOC_FREESP_32:
+	case XFS_IOC_ALLOCSP64_32:
+	case XFS_IOC_FREESP64_32:
+	case XFS_IOC_RESVSP_32:
+	case XFS_IOC_UNRESVSP_32:
+	case XFS_IOC_RESVSP64_32:
+	case XFS_IOC_UNRESVSP64_32:
+		arg = xfs_ioctl32_flock(arg);
+		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+		break;
+
+#else /* These are handled fine if no alignment issues */
 	case XFS_IOC_ALLOCSP:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_RESVSP:
@@ -134,6 +190,9 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
 	case XFS_IOC_FREESP64:
 	case XFS_IOC_RESVSP64:
 	case XFS_IOC_UNRESVSP64:
+		break;
+
+	/* xfs_bstat_t still has wrong u32 vs u64 alignment */
 	case XFS_IOC_SWAPEXT:
 		break;
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index f252605514e..77708a8c9f8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,7 +140,6 @@ linvfs_mknod(
 
 	memset(&va, 0, sizeof(va));
 	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-	va.va_type = IFTOVT(mode);
 	va.va_mode = mode;
 
 	switch (mode & S_IFMT) {
@@ -308,14 +307,13 @@ linvfs_symlink(
 	cvp = NULL;
 
 	memset(&va, 0, sizeof(va));
-	va.va_type = VLNK;
-	va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO;
+	va.va_mode = S_IFLNK |
+		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
 	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
 
 	error = 0;
 	VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
 	if (!error && cvp) {
-		ASSERT(cvp->v_type == VLNK);
 		ip = LINVFS_GET_IP(cvp);
 		d_instantiate(dentry, ip);
 		validate_fields(dir);
@@ -425,9 +423,14 @@ linvfs_follow_link(
 	return NULL;
 }
 
-static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+STATIC void
+linvfs_put_link(
+	struct dentry	*dentry,
+	struct nameidata *nd,
+	void		*p)
 {
-	char *s = nd_get_link(nd);
+	char		*s = nd_get_link(nd);
+
 	if (!IS_ERR(s))
 		kfree(s);
 }
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 42dc5e4662e..68c5d885ed9 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -64,7 +64,6 @@
 #include <sema.h>
 #include <time.h>
 
-#include <support/qsort.h>
 #include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/move.h>
@@ -104,6 +103,7 @@
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
 #include <xfs_iops.h>
+#include <xfs_aops.h>
 #include <xfs_super.h>
 #include <xfs_globals.h>
 #include <xfs_fs_subr.h>
@@ -254,11 +254,18 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define MAX(a,b)	(max(a,b))
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 #define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+#define qsort(a,n,s,fn)	sort(a,n,s,fn,NULL)
 
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
 #define xfs_stack_trace()	dump_stack()
-
 #define xfs_itruncate_data(ip, off)	\
 	(-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
+#define xfs_statvfs_fsid(statp, mp)	\
+	({ u64 id = huge_encode_dev((mp)->m_dev);	\
+	   __kernel_fsid_t *fsid = &(statp)->f_fsid;	\
+	(fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); })
 
 
 /* Move the kernel do_div definition off to one side */
@@ -371,6 +378,4 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
 	return(x * y);
 }
 
-#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
-
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index acab58c4804..3b5fabe8dae 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -660,9 +660,6 @@ xfs_write(
 			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 
-		if (ioflags & IO_ISAIO)
-			return XFS_ERROR(-ENOSYS);
-
 		if ((pos & target->pbr_smask) || (count & target->pbr_smask))
 			return XFS_ERROR(-EINVAL);
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index f197a720e39..6294dcdb797 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -70,9 +70,10 @@ struct xfs_iomap;
 #define	XFS_SENDFILE_ENTER	21
 #define	XFS_WRITEPAGE_ENTER	22
 #define	XFS_RELEASEPAGE_ENTER	23
-#define	XFS_IOMAP_ALLOC_ENTER	24
-#define	XFS_IOMAP_ALLOC_MAP	25
-#define	XFS_IOMAP_UNWRITTEN	26
+#define	XFS_INVALIDPAGE_ENTER	24
+#define	XFS_IOMAP_ALLOC_ENTER	25
+#define	XFS_IOMAP_ALLOC_MAP	26
+#define	XFS_IOMAP_UNWRITTEN	27
 extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
 				void *, size_t, loff_t, int);
 extern void xfs_inval_cached_trace(struct xfs_iocore *,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f6dd7de2592..0da87bfc999 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -70,11 +70,15 @@
 #include <linux/namei.h>
 #include <linux/init.h>
 #include <linux/mount.h>
+#include <linux/mempool.h>
 #include <linux/writeback.h>
+#include <linux/kthread.h>
 
 STATIC struct quotactl_ops linvfs_qops;
 STATIC struct super_operations linvfs_sops;
-STATIC kmem_zone_t *linvfs_inode_zone;
+STATIC kmem_zone_t *xfs_vnode_zone;
+STATIC kmem_zone_t *xfs_ioend_zone;
+mempool_t *xfs_ioend_pool;
 
 STATIC struct xfs_mount_args *
 xfs_args_allocate(
@@ -138,24 +142,25 @@ STATIC __inline__ void
 xfs_set_inodeops(
 	struct inode		*inode)
 {
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-
-	if (vp->v_type == VNON) {
-		vn_mark_bad(vp);
-	} else if (S_ISREG(inode->i_mode)) {
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
 		inode->i_op = &linvfs_file_inode_operations;
 		inode->i_fop = &linvfs_file_operations;
 		inode->i_mapping->a_ops = &linvfs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
+		break;
+	case S_IFDIR:
 		inode->i_op = &linvfs_dir_inode_operations;
 		inode->i_fop = &linvfs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
+		break;
+	case S_IFLNK:
 		inode->i_op = &linvfs_symlink_inode_operations;
 		if (inode->i_blocks)
 			inode->i_mapping->a_ops = &linvfs_aops;
-	} else {
+		break;
+	default:
 		inode->i_op = &linvfs_file_inode_operations;
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
 	}
 }
 
@@ -167,16 +172,23 @@ xfs_revalidate_inode(
 {
 	struct inode		*inode = LINVFS_GET_IP(vp);
 
-	inode->i_mode	= (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type);
+	inode->i_mode	= ip->i_d.di_mode;
 	inode->i_nlink	= ip->i_d.di_nlink;
 	inode->i_uid	= ip->i_d.di_uid;
 	inode->i_gid	= ip->i_d.di_gid;
-	if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_rdev =
+			MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+			      sysv_minor(ip->i_df.if_u2.if_rdev));
+		break;
+	default:
 		inode->i_rdev = 0;
-	} else {
-		xfs_dev_t dev = ip->i_df.if_u2.if_rdev;
-		inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
+		break;
 	}
+
 	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
@@ -231,7 +243,6 @@ xfs_initialize_vnode(
 	 * finish our work.
 	 */
 	if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
-		vp->v_type = IFTOVT(ip->i_d.di_mode);
 		xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
 		xfs_set_inodeops(inode);
 	
@@ -274,8 +285,7 @@ linvfs_alloc_inode(
 {
 	vnode_t			*vp;
 
-	vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone, 
-                kmem_flags_convert(KM_SLEEP));
+	vp = kmem_cache_alloc(xfs_vnode_zone, kmem_flags_convert(KM_SLEEP));
 	if (!vp)
 		return NULL;
 	return LINVFS_GET_IP(vp);
@@ -285,11 +295,11 @@ STATIC void
 linvfs_destroy_inode(
 	struct inode		*inode)
 {
-	kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode));
+	kmem_zone_free(xfs_vnode_zone, LINVFS_GET_VP(inode));
 }
 
 STATIC void
-init_once(
+linvfs_inode_init_once(
 	void			*data,
 	kmem_cache_t		*cachep,
 	unsigned long		flags)
@@ -302,21 +312,41 @@ init_once(
 }
 
 STATIC int
-init_inodecache( void )
+linvfs_init_zones(void)
 {
-	linvfs_inode_zone = kmem_cache_create("linvfs_icache",
+	xfs_vnode_zone = kmem_cache_create("xfs_vnode",
 				sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
-				init_once, NULL);
-	if (linvfs_inode_zone == NULL)
-		return -ENOMEM;
+				linvfs_inode_init_once, NULL);
+	if (!xfs_vnode_zone)
+		goto out;
+
+	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+	if (!xfs_ioend_zone)
+		goto out_destroy_vnode_zone;
+
+	xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
+			mempool_alloc_slab, mempool_free_slab,
+			xfs_ioend_zone);
+	if (!xfs_ioend_pool)
+		goto out_free_ioend_zone;
+
 	return 0;
+
+
+ out_free_ioend_zone:
+	kmem_zone_destroy(xfs_ioend_zone);
+ out_destroy_vnode_zone:
+	kmem_zone_destroy(xfs_vnode_zone);
+ out:
+	return -ENOMEM;
 }
 
 STATIC void
-destroy_inodecache( void )
+linvfs_destroy_zones(void)
 {
-	if (kmem_cache_destroy(linvfs_inode_zone))
-		printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__);
+	mempool_destroy(xfs_ioend_pool);
+	kmem_zone_destroy(xfs_vnode_zone);
+	kmem_zone_destroy(xfs_ioend_zone);
 }
 
 /*
@@ -354,17 +384,38 @@ linvfs_clear_inode(
 	struct inode		*inode)
 {
 	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error, cache;
 
-	if (vp) {
-		vn_rele(vp);
-		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-		/*
-		 * Do all our cleanup, and remove this vnode.
-		 */
-		vn_remove(vp);
+	vn_trace_entry(vp, "clear_inode", (inst_t *)__return_address);
+
+	XFS_STATS_INC(vn_rele);
+	XFS_STATS_INC(vn_remove);
+	XFS_STATS_INC(vn_reclaim);
+	XFS_STATS_DEC(vn_active);
+
+	/*
+	 * This can happen because xfs_iget_core calls xfs_idestroy if we
+	 * find an inode with di_mode == 0 but without IGET_CREATE set.
+	 */
+	if (vp->v_fbhv)
+		VOP_INACTIVE(vp, NULL, cache);
+
+	VN_LOCK(vp);
+	vp->v_flag &= ~VMODIFIED;
+	VN_UNLOCK(vp, 0);
+
+	if (vp->v_fbhv) {
+		VOP_RECLAIM(vp, error);
+		if (error)
+			panic("vn_purge: cannot reclaim");
 	}
-}
 
+	ASSERT(vp->v_fbhv == NULL);
+
+#ifdef XFS_VNODE_TRACE
+	ktrace_free(vp->v_trace);
+#endif
+}
 
 /*
  * Enqueue a work item to be picked up by the vfs xfssyncd thread.
@@ -466,25 +517,16 @@ xfssyncd(
 {
 	long			timeleft;
 	vfs_t			*vfsp = (vfs_t *) arg;
-	struct list_head	tmp;
 	struct vfs_sync_work	*work, *n;
+	LIST_HEAD		(tmp);
 
-	daemonize("xfssyncd");
-
-	vfsp->vfs_sync_work.w_vfs = vfsp;
-	vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
-	vfsp->vfs_sync_task = current;
-	wmb();
-	wake_up(&vfsp->vfs_wait_sync_task);
-
-	INIT_LIST_HEAD(&tmp);
 	timeleft = (xfs_syncd_centisecs * HZ) / 100;
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		timeleft = schedule_timeout(timeleft);
 		/* swsusp */
 		try_to_freeze();
-		if (vfsp->vfs_flag & VFS_UMOUNT)
+		if (kthread_should_stop())
 			break;
 
 		spin_lock(&vfsp->vfs_sync_lock);
@@ -513,10 +555,6 @@ xfssyncd(
 		}
 	}
 
-	vfsp->vfs_sync_task = NULL;
-	wmb();
-	wake_up(&vfsp->vfs_wait_sync_task);
-
 	return 0;
 }
 
@@ -524,13 +562,11 @@ STATIC int
 linvfs_start_syncd(
 	vfs_t			*vfsp)
 {
-	int			pid;
-
-	pid = kernel_thread(xfssyncd, (void *) vfsp,
-			CLONE_VM | CLONE_FS | CLONE_FILES);
-	if (pid < 0)
-		return -pid;
-	wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task);
+	vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
+	vfsp->vfs_sync_work.w_vfs = vfsp;
+	vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd");
+	if (IS_ERR(vfsp->vfs_sync_task))
+		return -PTR_ERR(vfsp->vfs_sync_task);
 	return 0;
 }
 
@@ -538,11 +574,7 @@ STATIC void
 linvfs_stop_syncd(
 	vfs_t			*vfsp)
 {
-	vfsp->vfs_flag |= VFS_UMOUNT;
-	wmb();
-
-	wake_up_process(vfsp->vfs_sync_task);
-	wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task);
+	kthread_stop(vfsp->vfs_sync_task);
 }
 
 STATIC void
@@ -866,9 +898,9 @@ init_xfs_fs( void )
 
 	ktrace_init(64);
 
-	error = init_inodecache();
+	error = linvfs_init_zones();
 	if (error < 0)
-		goto undo_inodecache;
+		goto undo_zones;
 
 	error = pagebuf_init();
 	if (error < 0)
@@ -889,9 +921,9 @@ undo_register:
 	pagebuf_terminate();
 
 undo_pagebuf:
-	destroy_inodecache();
+	linvfs_destroy_zones();
 
-undo_inodecache:
+undo_zones:
 	return error;
 }
 
@@ -903,7 +935,7 @@ exit_xfs_fs( void )
 	unregister_filesystem(&xfs_fs_type);
 	xfs_cleanup();
 	pagebuf_terminate();
-	destroy_inodecache();
+	linvfs_destroy_zones();
 	ktrace_uninit();
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 669c6164495..34cc902ec11 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -251,7 +251,6 @@ vfs_allocate( void )
 	bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
 	INIT_LIST_HEAD(&vfsp->vfs_sync_list);
 	spin_lock_init(&vfsp->vfs_sync_lock);
-	init_waitqueue_head(&vfsp->vfs_wait_sync_task);
 	init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
 	return vfsp;
 }
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 7ee1f714e9b..f0ab574fb47 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -65,7 +65,6 @@ typedef struct vfs {
 	spinlock_t		vfs_sync_lock;	/* work item list lock */
 	int 			vfs_sync_seq;	/* sync thread generation no. */
 	wait_queue_head_t	vfs_wait_single_sync_task;
-	wait_queue_head_t	vfs_wait_sync_task;
 } vfs_t;
 
 #define vfs_fbhv		vfs_bh.bh_first	/* 1st on vfs behavior chain */
@@ -96,7 +95,6 @@ typedef enum {
 #define VFS_RDONLY		0x0001	/* read-only vfs */
 #define VFS_GRPID		0x0002	/* group-ID assigned from directory */
 #define VFS_DMI			0x0004	/* filesystem has the DMI enabled */
-#define VFS_UMOUNT		0x0008	/* unmount in progress */
 #define VFS_END			0x0008	/* max flag */
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 250cad54e89..268f45bf6a9 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -42,93 +42,33 @@ DEFINE_SPINLOCK(vnumber_lock);
  */
 #define NVSYNC                  37
 #define vptosync(v)             (&vsync[((unsigned long)v) % NVSYNC])
-sv_t vsync[NVSYNC];
-
-/*
- * Translate stat(2) file types to vnode types and vice versa.
- * Aware of numeric order of S_IFMT and vnode type values.
- */
-enum vtype iftovt_tab[] = {
-	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
-	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
-};
-
-u_short vttoif_tab[] = {
-	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
-};
+STATIC wait_queue_head_t vsync[NVSYNC];
 
 
 void
 vn_init(void)
 {
-	register sv_t *svp;
-	register int i;
+	int i;
 
-	for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
-		init_sv(svp, SV_DEFAULT, "vsy", i);
+	for (i = 0; i < NVSYNC; i++)
+		init_waitqueue_head(&vsync[i]);
 }
 
-/*
- * Clean a vnode of filesystem-specific data and prepare it for reuse.
- */
-STATIC int
-vn_reclaim(
+void
+vn_iowait(
 	struct vnode	*vp)
 {
-	int		error;
+	wait_queue_head_t *wq = vptosync(vp);
 
-	XFS_STATS_INC(vn_reclaim);
-	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
-
-	/*
-	 * Only make the VOP_RECLAIM call if there are behaviors
-	 * to call.
-	 */
-	if (vp->v_fbhv) {
-		VOP_RECLAIM(vp, error);
-		if (error)
-			return -error;
-	}
-	ASSERT(vp->v_fbhv == NULL);
-
-	VN_LOCK(vp);
-	vp->v_flag &= (VRECLM|VWAIT);
-	VN_UNLOCK(vp, 0);
-
-	vp->v_type = VNON;
-	vp->v_fbhv = NULL;
-
-#ifdef XFS_VNODE_TRACE
-	ktrace_free(vp->v_trace);
-	vp->v_trace = NULL;
-#endif
-
-	return 0;
-}
-
-STATIC void
-vn_wakeup(
-	struct vnode	*vp)
-{
-	VN_LOCK(vp);
-	if (vp->v_flag & VWAIT)
-		sv_broadcast(vptosync(vp));
-	vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
-	VN_UNLOCK(vp, 0);
+	wait_event(*wq, (atomic_read(&vp->v_iocount) == 0));
 }
 
-int
-vn_wait(
+void
+vn_iowake(
 	struct vnode	*vp)
 {
-	VN_LOCK(vp);
-	if (vp->v_flag & (VINACT | VRECLM)) {
-		vp->v_flag |= VWAIT;
-		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
-		return 1;
-	}
-	VN_UNLOCK(vp, 0);
-	return 0;
+	if (atomic_dec_and_test(&vp->v_iocount))
+		wake_up(vptosync(vp));
 }
 
 struct vnode *
@@ -154,6 +94,8 @@ vn_initialize(
 	/* Initialize the first behavior and the behavior chain head. */
 	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
 
+	atomic_set(&vp->v_iocount, 0);
+
 #ifdef	XFS_VNODE_TRACE
 	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
 #endif	/* XFS_VNODE_TRACE */
@@ -163,30 +105,6 @@ vn_initialize(
 }
 
 /*
- * Get a reference on a vnode.
- */
-vnode_t *
-vn_get(
-	struct vnode	*vp,
-	vmap_t		*vmap)
-{
-	struct inode	*inode;
-
-	XFS_STATS_INC(vn_get);
-	inode = LINVFS_GET_IP(vp);
-	if (inode->i_state & I_FREEING)
-		return NULL;
-
-	inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino);
-	if (!inode)	/* Inode not present */
-		return NULL;
-
-	vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
-
-	return vp;
-}
-
-/*
  * Revalidate the Linux inode from the vattr.
  * Note: i_size _not_ updated; we must hold the inode
  * semaphore when doing that - callers responsibility.
@@ -198,7 +116,7 @@ vn_revalidate_core(
 {
 	struct inode	*inode = LINVFS_GET_IP(vp);
 
-	inode->i_mode	    = VTTOIF(vap->va_type) | vap->va_mode;
+	inode->i_mode	    = vap->va_mode;
 	inode->i_nlink	    = vap->va_nlink;
 	inode->i_uid	    = vap->va_uid;
 	inode->i_gid	    = vap->va_gid;
@@ -247,71 +165,6 @@ vn_revalidate(
 }
 
 /*
- * purge a vnode from the cache
- * At this point the vnode is guaranteed to have no references (vn_count == 0)
- * The caller has to make sure that there are no ways someone could
- * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
- */
-void
-vn_purge(
-	struct vnode	*vp,
-	vmap_t		*vmap)
-{
-	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
-
-again:
-	/*
-	 * Check whether vp has already been reclaimed since our caller
-	 * sampled its version while holding a filesystem cache lock that
-	 * its VOP_RECLAIM function acquires.
-	 */
-	VN_LOCK(vp);
-	if (vp->v_number != vmap->v_number) {
-		VN_UNLOCK(vp, 0);
-		return;
-	}
-
-	/*
-	 * If vp is being reclaimed or inactivated, wait until it is inert,
-	 * then proceed.  Can't assume that vnode is actually reclaimed
-	 * just because the reclaimed flag is asserted -- a vn_alloc
-	 * reclaim can fail.
-	 */
-	if (vp->v_flag & (VINACT | VRECLM)) {
-		ASSERT(vn_count(vp) == 0);
-		vp->v_flag |= VWAIT;
-		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
-		goto again;
-	}
-
-	/*
-	 * Another process could have raced in and gotten this vnode...
-	 */
-	if (vn_count(vp) > 0) {
-		VN_UNLOCK(vp, 0);
-		return;
-	}
-
-	XFS_STATS_DEC(vn_active);
-	vp->v_flag |= VRECLM;
-	VN_UNLOCK(vp, 0);
-
-	/*
-	 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
-	 * vp's filesystem to flush and invalidate all cached resources.
-	 * When vn_reclaim returns, vp should have no private data,
-	 * either in a system cache or attached to v_data.
-	 */
-	if (vn_reclaim(vp) != 0)
-		panic("vn_purge: cannot reclaim");
-
-	/*
-	 * Wakeup anyone waiting for vp to be reclaimed.
-	 */
-	vn_wakeup(vp);
-}
-
-/*
  * Add a reference to a referenced vnode.
  */
 struct vnode *
@@ -330,80 +183,6 @@ vn_hold(
 	return vp;
 }
 
-/*
- *  Call VOP_INACTIVE on last reference.
- */
-void
-vn_rele(
-	struct vnode	*vp)
-{
-	int		vcnt;
-	int		cache;
-
-	XFS_STATS_INC(vn_rele);
-
-	VN_LOCK(vp);
-
-	vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
-	vcnt = vn_count(vp);
-
-	/*
-	 * Since we always get called from put_inode we know
-	 * that i_count won't be decremented after we
-	 * return.
-	 */
-	if (!vcnt) {
-		/*
-		 * As soon as we turn this on, noone can find us in vn_get
-		 * until we turn off VINACT or VRECLM
-		 */
-		vp->v_flag |= VINACT;
-		VN_UNLOCK(vp, 0);
-
-		/*
-		 * Do not make the VOP_INACTIVE call if there
-		 * are no behaviors attached to the vnode to call.
-		 */
-		if (vp->v_fbhv)
-			VOP_INACTIVE(vp, NULL, cache);
-
-		VN_LOCK(vp);
-		if (vp->v_flag & VWAIT)
-			sv_broadcast(vptosync(vp));
-
-		vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
-	}
-
-	VN_UNLOCK(vp, 0);
-
-	vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
-}
-
-/*
- * Finish the removal of a vnode.
- */
-void
-vn_remove(
-	struct vnode	*vp)
-{
-	vmap_t		vmap;
-
-	/* Make sure we don't do this to the same vnode twice */
-	if (!(vp->v_fbhv))
-		return;
-
-	XFS_STATS_INC(vn_remove);
-	vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
-
-	/*
-	 * After the following purge the vnode
-	 * will no longer exist.
-	 */
-	VMAP(vp, vmap);
-	vn_purge(vp, &vmap);
-}
-
-
 #ifdef	XFS_VNODE_TRACE
 
 #define KTRACE_ENTER(vp, vk, s, line, ra)			\
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index a6e57c647be..35f306cebb8 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -65,10 +65,6 @@ struct vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-/*
- * Vnode types.  VNON means no type.
- */
-enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };
 
 typedef xfs_ino_t vnumber_t;
 typedef struct dentry vname_t;
@@ -77,15 +73,14 @@ typedef bhv_head_t vn_bhv_head_t;
 /*
  * MP locking protocols:
  *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
- *	v_type					read-only or fs-dependent
  */
 typedef struct vnode {
 	__u32		v_flag;			/* vnode flags (see below) */
-	enum vtype	v_type;			/* vnode type */
 	struct vfs	*v_vfsp;		/* ptr to containing VFS */
 	vnumber_t	v_number;		/* in-core vnode number */
 	vn_bhv_head_t	v_bh;			/* behavior head */
 	spinlock_t	v_lock;			/* VN_LOCK/VN_UNLOCK */
+	atomic_t	v_iocount;		/* outstanding I/O count */
 #ifdef XFS_VNODE_TRACE
 	struct ktrace	*v_trace;		/* trace header structure    */
 #endif
@@ -93,6 +88,12 @@ typedef struct vnode {
 	/* inode MUST be last */
 } vnode_t;
 
+#define VN_ISLNK(vp)	S_ISLNK((vp)->v_inode.i_mode)
+#define VN_ISREG(vp)	S_ISREG((vp)->v_inode.i_mode)
+#define VN_ISDIR(vp)	S_ISDIR((vp)->v_inode.i_mode)
+#define VN_ISCHR(vp)	S_ISCHR((vp)->v_inode.i_mode)
+#define VN_ISBLK(vp)	S_ISBLK((vp)->v_inode.i_mode)
+
 #define v_fbhv			v_bh.bh_first	       /* first behavior */
 #define v_fops			v_bh.bh_first->bd_ops  /* first behavior ops */
 
@@ -133,22 +134,8 @@ typedef enum {
 #define LINVFS_GET_IP(vp)	(&(vp)->v_inode)
 
 /*
- * Convert between vnode types and inode formats (since POSIX.1
- * defines mode word of stat structure in terms of inode formats).
- */
-extern enum vtype	iftovt_tab[];
-extern u_short		vttoif_tab[];
-#define IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
-#define VTTOIF(indx)	(vttoif_tab[(int)(indx)])
-#define MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
-
-
-/*
  * Vnode flags.
  */
-#define VINACT		       0x1	/* vnode is being inactivated	*/
-#define VRECLM		       0x2	/* vnode is being reclaimed	*/
-#define VWAIT		       0x4	/* waiting for VINACT/VRECLM to end */
 #define VMODIFIED	       0x8	/* XFS inode state possibly differs */
 					/* to the Linux inode state.	*/
 
@@ -408,7 +395,6 @@ typedef struct vnodeops {
  */
 typedef struct vattr {
 	int		va_mask;	/* bit-mask of attributes present */
-	enum vtype	va_type;	/* vnode type (for create) */
 	mode_t		va_mode;	/* file access mode and type */
 	xfs_nlink_t	va_nlink;	/* number of references to file */
 	uid_t		va_uid;		/* owner user id */
@@ -498,27 +484,12 @@ typedef struct vattr {
  * Check whether mandatory file locking is enabled.
  */
 #define MANDLOCK(vp, mode)	\
-	((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
+	(VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
 
 extern void	vn_init(void);
-extern int	vn_wait(struct vnode *);
 extern vnode_t	*vn_initialize(struct inode *);
 
 /*
- * Acquiring and invalidating vnodes:
- *
- *	if (vn_get(vp, version, 0))
- *		...;
- *	vn_purge(vp, version);
- *
- * vn_get and vn_purge must be called with vmap_t arguments, sampled
- * while a lock that the vnode's VOP_RECLAIM function acquires is
- * held, to ensure that the vnode sampled with the lock held isn't
- * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
- * and the subsequent vn_get or vn_purge.
- */
-
-/*
  * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
  */
 typedef struct vnode_map {
@@ -531,11 +502,11 @@ typedef struct vnode_map {
 			 (vmap).v_number = (vp)->v_number,	\
 			 (vmap).v_ino	 = (vp)->v_inode.i_ino; }
 
-extern void	vn_purge(struct vnode *, vmap_t *);
-extern vnode_t	*vn_get(struct vnode *, vmap_t *);
 extern int	vn_revalidate(struct vnode *);
 extern void	vn_revalidate_core(struct vnode *, vattr_t *);
-extern void	vn_remove(struct vnode *);
+
+extern void	vn_iowait(struct vnode *vp);
+extern void	vn_iowake(struct vnode *vp);
 
 static inline int vn_count(struct vnode *vp)
 {
@@ -546,7 +517,6 @@ static inline int vn_count(struct vnode *vp)
  * Vnode reference counting functions (and macros for compatibility).
  */
 extern vnode_t	*vn_hold(struct vnode *);
-extern void	vn_rele(struct vnode *);
 
 #if defined(XFS_VNODE_TRACE)
 #define VN_HOLD(vp)		\
@@ -560,6 +530,12 @@ extern void	vn_rele(struct vnode *);
 #define VN_RELE(vp)		(iput(LINVFS_GET_IP(vp)))
 #endif
 
+static inline struct vnode *vn_grab(struct vnode *vp)
+{
+	struct inode *inode = igrab(LINVFS_GET_IP(vp));
+	return inode ? LINVFS_GET_VP(inode) : NULL;
+}
+
 /*
  * Vname handling macros.
  */
diff --git a/fs/xfs/quota/Makefile b/fs/xfs/quota/Makefile
new file mode 100644
index 00000000000..7a4f725b282
--- /dev/null
+++ b/fs/xfs/quota/Makefile
@@ -0,0 +1 @@
+include $(TOPDIR)/fs/xfs/quota/Makefile-linux-$(VERSION).$(PATCHLEVEL)
diff --git a/fs/xfs/quota/Makefile-linux-2.6 b/fs/xfs/quota/Makefile-linux-2.6
new file mode 100644
index 00000000000..93e60e83935
--- /dev/null
+++ b/fs/xfs/quota/Makefile-linux-2.6
@@ -0,0 +1,53 @@
+#
+# Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+
+EXTRA_CFLAGS += -I $(TOPDIR)/fs/xfs -I $(TOPDIR)/fs/xfs/linux-2.6
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DDEBUG
+	#EXTRA_CFLAGS += -DQUOTADEBUG
+endif
+ifeq ($(CONFIG_XFS_TRACE),y)
+	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
+	EXTRA_CFLAGS += -DXFS_VNODE_TRACE
+endif
+
+xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm_bhv.o \
+				   xfs_qm.o
+
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)		+= xfs_qm_stats.o
+endif
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 46ce1e3ce1d..e2e8d35fa4d 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -421,7 +421,7 @@ xfs_qm_init_dquot_blk(
  */
 STATIC int
 xfs_qm_dqalloc(
-	xfs_trans_t	*tp,
+	xfs_trans_t	**tpp,
 	xfs_mount_t	*mp,
 	xfs_dquot_t	*dqp,
 	xfs_inode_t	*quotip,
@@ -433,6 +433,7 @@ xfs_qm_dqalloc(
 	xfs_bmbt_irec_t map;
 	int		nmaps, error, committed;
 	xfs_buf_t	*bp;
+	xfs_trans_t	*tp = *tpp;
 
 	ASSERT(tp != NULL);
 	xfs_dqtrace_entry(dqp, "DQALLOC");
@@ -492,10 +493,32 @@ xfs_qm_dqalloc(
 	xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
 			      dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
 
-	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
+	/*
+	 * xfs_bmap_finish() may commit the current transaction and
+	 * start a second transaction if the freelist is not empty.
+	 *
+	 * Since we still want to modify this buffer, we need to
+	 * ensure that the buffer is not released on commit of
+	 * the first transaction and ensure the buffer is added to the
+	 * second transaction.
+	 *
+	 * If there is only one transaction then don't stop the buffer
+	 * from being released when it commits later on.
+	 */
+
+	xfs_trans_bhold(tp, bp);
+
+	if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) {
 		goto error1;
 	}
 
+	if (committed) {
+		tp = *tpp;
+		xfs_trans_bjoin(tp, bp);
+	} else {
+		xfs_trans_bhold_release(tp, bp);
+	}
+
 	*O_bpp = bp;
 	return 0;
 
@@ -514,7 +537,7 @@ xfs_qm_dqalloc(
  */
 STATIC int
 xfs_qm_dqtobp(
-	xfs_trans_t		*tp,
+	xfs_trans_t		**tpp,
 	xfs_dquot_t		*dqp,
 	xfs_disk_dquot_t	**O_ddpp,
 	xfs_buf_t		**O_bpp,
@@ -528,6 +551,7 @@ xfs_qm_dqtobp(
 	xfs_disk_dquot_t *ddq;
 	xfs_dqid_t	id;
 	boolean_t	newdquot;
+	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
 	mp = dqp->q_mount;
 	id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
@@ -579,9 +603,10 @@ xfs_qm_dqtobp(
 				return (ENOENT);
 
 			ASSERT(tp);
-			if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
+			if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
 						dqp->q_fileoffset, &bp)))
 				return (error);
+			tp = *tpp;
 			newdquot = B_TRUE;
 		} else {
 			/*
@@ -645,7 +670,7 @@ xfs_qm_dqtobp(
 /* ARGSUSED */
 STATIC int
 xfs_qm_dqread(
-	xfs_trans_t	*tp,
+	xfs_trans_t	**tpp,
 	xfs_dqid_t	id,
 	xfs_dquot_t	*dqp,	/* dquot to get filled in */
 	uint		flags)
@@ -653,15 +678,19 @@ xfs_qm_dqread(
 	xfs_disk_dquot_t *ddqp;
 	xfs_buf_t	 *bp;
 	int		 error;
+	xfs_trans_t	 *tp;
+
+	ASSERT(tpp);
 
 	/*
 	 * get a pointer to the on-disk dquot and the buffer containing it
 	 * dqp already knows its own type (GROUP/USER).
 	 */
 	xfs_dqtrace_entry(dqp, "DQREAD");
-	if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
+	if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
 		return (error);
 	}
+	tp = *tpp;
 
 	/* copy everything from disk dquot to the incore dquot */
 	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
@@ -740,7 +769,7 @@ xfs_qm_idtodq(
 	 * Read it from disk; xfs_dqread() takes care of
 	 * all the necessary initialization of dquot's fields (locks, etc)
 	 */
-	if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
+	if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
 		/*
 		 * This can happen if quotas got turned off (ESRCH),
 		 * or if the dquot didn't exist on disk and we ask to
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 39175103c8e..8ebc87176c7 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -113,20 +113,6 @@ typedef struct xfs_dquot {
 
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
-/*
- * Quota Accounting/Enforcement flags
- */
-#define XFS_ALL_QUOTA_ACCT	\
-		(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD	(XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
-
-#define XFS_IS_QUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_QUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
-#define XFS_IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_GQUOTA_ACCT)
-
 #ifdef DEBUG
 static inline int
 XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f5271b7b1e8..e74eaa7dd1b 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -509,6 +509,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
 
 	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
 	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
 	qf->qql_format.qf_size = 1;
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f665ca8f9e9..efde16e0a91 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -365,16 +365,6 @@ xfs_qm_mount_quotas(
 	int		error = 0;
 	uint		sbf;
 
-	/*
-	 * If a file system had quotas running earlier, but decided to
-	 * mount without -o uquota/pquota/gquota options, revoke the
-	 * quotachecked license, and bail out.
-	 */
-	if (! XFS_IS_QUOTA_ON(mp) &&
-	    (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT)) {
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
 
 	/*
 	 * If quotas on realtime volumes is not supported, we disable
@@ -388,11 +378,8 @@ xfs_qm_mount_quotas(
 		goto write_changes;
 	}
 
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-	cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
-#endif
-
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
 	/*
 	 * Allocate the quotainfo structure inside the mount struct, and
 	 * create quotainode(s), and change/rev superblock if necessary.
@@ -410,19 +397,14 @@ xfs_qm_mount_quotas(
 	 */
 	if (XFS_QM_NEED_QUOTACHECK(mp) &&
 		!(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-#ifdef DEBUG
-		cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
-#endif
 		if ((error = xfs_qm_quotacheck(mp))) {
 			/* Quotacheck has failed and quotas have
 			 * been disabled.
 			 */
 			return XFS_ERROR(error);
 		}
-#ifdef DEBUG
-		cmn_err(CE_NOTE, "Done quotacheck.");
-#endif
 	}
+
  write_changes:
 	/*
 	 * We actually don't have to acquire the SB_LOCK at all.
@@ -2010,7 +1992,7 @@ xfs_qm_quotacheck(
 		ASSERT(mp->m_quotainfo != NULL);
 		ASSERT(xfs_Gqm != NULL);
 		xfs_qm_destroy_quotainfo(mp);
-		xfs_mount_reset_sbqflags(mp);
+		(void)xfs_mount_reset_sbqflags(mp);
 	} else {
 		cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
 	}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index b03eecf3b6c..0b00b3c6701 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -184,8 +184,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
 #define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
 
-extern void		xfs_mount_reset_sbqflags(xfs_mount_t *);
-
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern int		xfs_qm_mount_quotas(xfs_mount_t *, int);
 extern void		xfs_qm_mount_quotainit(xfs_mount_t *, uint);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index dc3c37a1e15..8890a18a99d 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -229,48 +229,6 @@ xfs_qm_syncall(
 	return error;
 }
 
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-void
-xfs_mount_reset_sbqflags(
-	xfs_mount_t		*mp)
-{
-	xfs_trans_t		*tp;
-	unsigned long		s;
-
-	mp->m_qflags = 0;
-	/*
-	 * It is OK to look at sb_qflags here in mount path,
-	 * without SB_LOCK.
-	 */
-	if (mp->m_sb.sb_qflags == 0)
-		return;
-	s = XFS_SB_LOCK(mp);
-	mp->m_sb.sb_qflags = 0;
-	XFS_SB_UNLOCK(mp, s);
-
-	/*
-	 * if the fs is readonly, let the incore superblock run
-	 * with quotas off but don't flush the update out to disk
-	 */
-	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
-		return;
-#ifdef QUOTADEBUG
-	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				      XFS_DEFAULT_LOG_COUNT)) {
-		xfs_trans_cancel(tp, 0);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_mount_reset_sbqflags: Superblock update failed!");
-		return;
-	}
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-	xfs_trans_commit(tp, 0, NULL);
-}
-
 STATIC int
 xfs_qm_newmount(
 	xfs_mount_t	*mp,
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68e98962dbe..15e02e8a9d4 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1053,7 +1053,6 @@ xfs_qm_dqrele_all_inodes(
 	struct xfs_mount *mp,
 	uint		 flags)
 {
-	vmap_t		vmap;
 	xfs_inode_t	*ip, *topino;
 	uint		ireclaims;
 	vnode_t		*vp;
@@ -1061,8 +1060,8 @@ xfs_qm_dqrele_all_inodes(
 
 	ASSERT(mp->m_quotainfo);
 
-again:
 	XFS_MOUNT_ILOCK(mp);
+again:
 	ip = mp->m_inodes;
 	if (ip == NULL) {
 		XFS_MOUNT_IUNLOCK(mp);
@@ -1090,18 +1089,14 @@ again:
 		}
 		vnode_refd = B_FALSE;
 		if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-			/*
-			 * Sample vp mapping while holding the mplock, lest
-			 * we come across a non-existent vnode.
-			 */
-			VMAP(vp, vmap);
 			ireclaims = mp->m_ireclaims;
 			topino = mp->m_inodes;
-			XFS_MOUNT_IUNLOCK(mp);
+			vp = vn_grab(vp);
+			if (!vp)
+				goto again;
 
+			XFS_MOUNT_IUNLOCK(mp);
 			/* XXX restart limit ? */
-			if ( ! (vp = vn_get(vp, &vmap)))
-				goto again;
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			vnode_refd = B_TRUE;
 		} else {
@@ -1137,7 +1132,6 @@ again:
 		 */
 		if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
 			/* XXX use a sentinel */
-			XFS_MOUNT_IUNLOCK(mp);
 			goto again;
 		}
 		ip = ip->i_mnext;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 4ed7b6928cd..4e1a5ec22fa 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -31,6 +31,7 @@
  */
 
 #include "debug.h"
+#include "spin.h"
 
 #include <asm/page.h>
 #include <linux/sched.h>
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 3dae14c8c55..fa8394f9437 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -170,7 +170,7 @@ ktrace_enter(
 	void            *val14,
 	void            *val15)
 {
-	static lock_t   wrap_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(wrap_lock);
 	unsigned long	flags;
 	int             index;
 	ktrace_entry_t  *ktep;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8d01dce8c53..92fd1d67f87 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -85,7 +85,7 @@ xfs_acl_vhasacl_default(
 {
 	int		error;
 
-	if (vp->v_type != VDIR)
+	if (!VN_ISDIR(vp))
 		return 0;
 	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
 	return (error == 0);
@@ -389,7 +389,7 @@ xfs_acl_allow_set(
 
 	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
 		return EPERM;
-	if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
+	if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp))
 		return ENOTDIR;
 	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 		return EROFS;
@@ -750,7 +750,7 @@ xfs_acl_inherit(
 	 * If the new file is a directory, its default ACL is a copy of
 	 * the containing directory's default ACL.
 	 */
-	if (vp->v_type == VDIR)
+	if (VN_ISDIR(vp))
 		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
 	if (!error && !basicperms)
 		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index ae35189b3d7..5ab0dd885b1 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -40,22 +40,28 @@
 
 #include <asm/byteorder.h>
 
-#ifdef __LITTLE_ENDIAN
-# define __BYTE_ORDER	__LITTLE_ENDIAN
-#endif
 #ifdef __BIG_ENDIAN
-# define __BYTE_ORDER	__BIG_ENDIAN
+#define	XFS_NATIVE_HOST	1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#else /* __KERNEL__ */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define	XFS_NATIVE_HOST	1
+#else
+#undef XFS_NATIVE_HOST
 #endif
 
 #endif	/* __KERNEL__ */
 
 /* do we need conversion? */
-
 #define ARCH_NOCONVERT 1
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-# define ARCH_CONVERT	0
-#else
+#ifdef XFS_NATIVE_HOST
 # define ARCH_CONVERT	ARCH_NOCONVERT
+#else
+# define ARCH_CONVERT	0
 #endif
 
 /* generic swapping macros */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6f5d283888a..3e76def1283 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4754,10 +4754,20 @@ xfs_bmapi(
 					error = xfs_mod_incore_sb(mp,
 							XFS_SBS_FDBLOCKS,
 							-(alen), rsvd);
-				if (!error)
+				if (!error) {
 					error = xfs_mod_incore_sb(mp,
 							XFS_SBS_FDBLOCKS,
 							-(indlen), rsvd);
+					if (error && rt) {
+						xfs_mod_incore_sb(ip->i_mount,
+							XFS_SBS_FREXTENTS,
+							extsz, rsvd);
+					} else if (error) {
+						xfs_mod_incore_sb(ip->i_mount,
+							XFS_SBS_FDBLOCKS,
+							alen, rsvd);
+					}
+				}
 
 				if (error) {
 					if (XFS_IS_QUOTA_ON(ip->i_mount))
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 09c413576ba..09a77b17565 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -2017,7 +2017,7 @@ xfs_bmbt_get_state(
 				ext_flag);
 }
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 /* Endian flipping versions of the bmbt extraction functions */
 void
 xfs_bmbt_disk_get_all(
@@ -2087,7 +2087,7 @@ xfs_bmbt_disk_get_state(
 	return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r),
 				ext_flag);
 }
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 
 /*
@@ -2531,7 +2531,7 @@ xfs_bmbt_set_allf(
 #endif	/* XFS_BIG_BLKNOS */
 }
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 /*
  * Set all the fields in a bmap extent record from the uncompressed form.
  */
@@ -2617,7 +2617,7 @@ xfs_bmbt_disk_set_allf(
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 /*
  * Set the blockcount field in a bmap extent record.
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0a40cf126c2..2cf4fe45cbc 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -62,7 +62,7 @@ typedef struct xfs_bmdr_block
  *  l1:0-20 are blockcount.
  */
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 
 #define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
 #define BMBT_EXNTFLAG_BITOFF	0
@@ -87,7 +87,7 @@ typedef struct xfs_bmdr_block
 #define BMBT_BLOCKCOUNT_BITOFF	64 /* Start of second 64 bit container */
 #define BMBT_BLOCKCOUNT_BITLEN	21
 
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 
 #define BMBT_USE_64	1
@@ -505,7 +505,7 @@ xfs_exntst_t
 xfs_bmbt_get_state(
 	xfs_bmbt_rec_t	*r);
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 void
 xfs_bmbt_disk_get_all(
 	xfs_bmbt_rec_t	*r,
@@ -538,7 +538,7 @@ xfs_bmbt_disk_get_startoff(
 	xfs_bmbt_get_blockcount(r)
 #define xfs_bmbt_disk_get_startoff(r) \
 	xfs_bmbt_get_startoff(r)
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 int
 xfs_bmbt_increment(
@@ -623,7 +623,7 @@ xfs_bmbt_set_state(
 	xfs_bmbt_rec_t	*r,
 	xfs_exntst_t	v);
 
-#if __BYTE_ORDER != __BIG_ENDIAN
+#ifndef XFS_NATIVE_HOST
 void
 xfs_bmbt_disk_set_all(
 	xfs_bmbt_rec_t	*r,
@@ -641,7 +641,7 @@ xfs_bmbt_disk_set_allf(
 	xfs_bmbt_set_all(r, s)
 #define xfs_bmbt_disk_set_allf(r, o, b, c, v) \
 	xfs_bmbt_set_allf(r, o, b, c, v)
-#endif
+#endif /* XFS_NATIVE_HOST */
 
 void
 xfs_bmbt_to_bmdr(
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 30b8285ad47..a264657acfd 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -274,6 +274,7 @@ xfs_buf_item_format(
 		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
 	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
 	vecp->i_len = base_size;
+	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
 	vecp++;
 	nvecs = 1;
 
@@ -320,12 +321,14 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 			nvecs++;
 			break;
 		} else if (next_bit != last_bit + 1) {
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 			nvecs++;
 			vecp++;
 			first_bit = next_bit;
@@ -337,6 +340,7 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 /* You would think we need to bump the nvecs here too, but we do not
  * this number is used by recovery, and it gets confused by the boundary
  * split here
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index dd423ce1bc8..480bffc1f29 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -127,13 +127,13 @@ typedef union {
 	 * Watch the order here (endian-ness dependent).
 	 */
 	struct {
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 		xfs_dahash_t	h;	/* hash value */
 		__uint32_t	be;	/* block and entry */
-#else	/* __BYTE_ORDER == __BIG_ENDIAN */
+#else
 		__uint32_t	be;	/* block and entry */
 		xfs_dahash_t	h;	/* hash value */
-#endif	/* __BYTE_ORDER == __BIG_ENDIAN */
+#endif /* XFS_NATIVE_HOST */
 	} s;
 } xfs_dircook_t;
 
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 55c17adaaa3..19e872856f6 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index db7cbd1bc85..cc7d1494a45 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -107,6 +107,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
 	log_vector->i_len = size;
+	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
 }
 
@@ -426,6 +427,7 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
 	log_vector->i_len = size;
+	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d3da00045f2..0d9ae8fb413 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -30,6 +30,8 @@
  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  */
 
+#include <linux/delay.h>
+
 #include "xfs.h"
 
 #include "xfs_macros.h"
@@ -505,17 +507,15 @@ xfs_iget(
 	vnode_t		*vp = NULL;
 	int		error;
 
-retry:
 	XFS_STATS_INC(xs_ig_attempts);
 
+retry:
 	if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
 		bhv_desc_t	*bdp;
 		xfs_inode_t	*ip;
-		int		newnode;
 
 		vp = LINVFS_GET_VP(inode);
 		if (inode->i_state & I_NEW) {
-inode_allocate:
 			vn_initialize(inode);
 			error = xfs_iget_core(vp, mp, tp, ino, flags,
 					lock_flags, ipp, bno);
@@ -526,32 +526,25 @@ inode_allocate:
 				iput(inode);
 			}
 		} else {
-			/* These are true if the inode is in inactive or
-			 * reclaim. The linux inode is about to go away,
-			 * wait for that path to finish, and try again.
+			/*
+			 * If the inode is not fully constructed due to
+			 * filehandle mistmatches wait for the inode to go
+			 * away and try again.
+			 *
+			 * iget_locked will call __wait_on_freeing_inode
+			 * to wait for the inode to go away.
 			 */
-			if (vp->v_flag & (VINACT | VRECLM)) {
-				vn_wait(vp);
+			if (is_bad_inode(inode) ||
+			    ((bdp = vn_bhv_lookup(VN_BHV_HEAD(vp),
+						  &xfs_vnodeops)) == NULL)) {
 				iput(inode);
+				delay(1);
 				goto retry;
 			}
 
-			if (is_bad_inode(inode)) {
-				iput(inode);
-				return EIO;
-			}
-
-			bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
-			if (bdp == NULL) {
-				XFS_STATS_INC(xs_ig_dup);
-				goto inode_allocate;
-			}
 			ip = XFS_BHVTOI(bdp);
 			if (lock_flags != 0)
 				xfs_ilock(ip, lock_flags);
-			newnode = (ip->i_d.di_mode == 0);
-			if (newnode)
-				xfs_iocore_inode_reinit(ip);
 			XFS_STATS_INC(xs_ig_found);
 			*ipp = ip;
 			error = 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34bdf590968..db43308aae9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1128,7 +1128,6 @@ xfs_ialloc(
 	ASSERT(ip != NULL);
 
 	vp = XFS_ITOV(ip);
-	vp->v_type = IFTOVT(mode);
 	ip->i_d.di_mode = (__uint16_t)mode;
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
@@ -1250,7 +1249,7 @@ xfs_ialloc(
 	 */
 	xfs_trans_log_inode(tp, ip, flags);
 
-	/* now that we have a v_type we can set Linux inode ops (& unlock) */
+	/* now that we have an i_mode  we can set Linux inode ops (& unlock) */
 	VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
 
 	*ipp = ip;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0eed30f5cb1..50e2cadf909 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -248,6 +248,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
 	vecp->i_len  = sizeof(xfs_inode_log_format_t);
+	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
 	vecp++;
 	nvecs	     = 1;
 
@@ -292,6 +293,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
 	vecp->i_len  = sizeof(xfs_dinode_core_t);
+	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
 	vecp++;
 	nvecs++;
 	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -339,7 +341,7 @@ xfs_inode_item_format(
 			nrecs = ip->i_df.if_bytes /
 				(uint)sizeof(xfs_bmbt_rec_t);
 			ASSERT(nrecs > 0);
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 			if (nrecs == ip->i_d.di_nextents) {
 				/*
 				 * There are no delayed allocation
@@ -349,6 +351,7 @@ xfs_inode_item_format(
 				vecp->i_addr =
 					(char *)(ip->i_df.if_u1.if_extents);
 				vecp->i_len = ip->i_df.if_bytes;
+				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
 			} else
 #endif
 			{
@@ -367,6 +370,7 @@ xfs_inode_item_format(
 				vecp->i_addr = (xfs_caddr_t)ext_buffer;
 				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 						XFS_DATA_FORK);
+				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
 			}
 			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
 			iip->ili_format.ilf_dsize = vecp->i_len;
@@ -384,6 +388,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
 			vecp->i_len = ip->i_df.if_broot_bytes;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -409,6 +414,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_df.if_real_bytes == 0) ||
 			       (ip->i_df.if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -467,7 +473,7 @@ xfs_inode_item_format(
 #endif
 			ASSERT(nrecs > 0);
 			ASSERT(nrecs == ip->i_d.di_anextents);
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 			/*
 			 * There are not delayed allocation extents
 			 * for attributes, so just point at the array.
@@ -486,6 +492,7 @@ xfs_inode_item_format(
 			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 					XFS_ATTR_FORK);
 #endif
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
 			iip->ili_format.ilf_asize = vecp->i_len;
 			vecp++;
 			nvecs++;
@@ -500,6 +507,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_afp->if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
 			vecp->i_len = ip->i_afp->if_broot_bytes;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -523,6 +531,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_afp->if_real_bytes == 0) ||
 			       (ip->i_afp->if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
+			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = (unsigned)data_bytes;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2edd6769e5d..d0f5be63cdd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -226,13 +226,12 @@ xfs_iomap(
 		xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
 		lockmode = XFS_LCK_MAP_SHARED(mp, io);
 		bmapi_flags = XFS_BMAPI_ENTIRE;
-		if (flags & BMAPI_IGNSTATE)
-			bmapi_flags |= XFS_BMAPI_IGSTATE;
 		break;
 	case BMAPI_WRITE:
 		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
 		lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
-		bmapi_flags = 0;
+		if (flags & BMAPI_IGNSTATE)
+			bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
 		XFS_ILOCK(mp, io, lockmode);
 		break;
 	case BMAPI_ALLOCATE:
@@ -391,9 +390,9 @@ xfs_iomap_write_direct(
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
 	xfs_bmap_free_t free_list;
 	int		aeof;
-	xfs_filblks_t	datablocks, qblocks, resblks;
+	xfs_filblks_t	qblocks, resblks;
 	int		committed;
-	int		numrtextents;
+	int		resrtextents;
 
 	/*
 	 * Make sure that the dquots are there. This doesn't hold
@@ -434,14 +433,14 @@ xfs_iomap_write_direct(
 
 		if (!(extsz = ip->i_d.di_extsize))
 			extsz = mp->m_sb.sb_rextsize;
-		numrtextents = qblocks = (count_fsb + extsz - 1);
-		do_div(numrtextents, mp->m_sb.sb_rextsize);
+		resrtextents = qblocks = (count_fsb + extsz - 1);
+		do_div(resrtextents, mp->m_sb.sb_rextsize);
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 		quota_flag = XFS_QMOPT_RES_RTBLKS;
-		datablocks = 0;
 	} else {
-		datablocks = qblocks = count_fsb;
+		resrtextents = 0;
+		resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, count_fsb);
 		quota_flag = XFS_QMOPT_RES_REGBLKS;
-		numrtextents = 0;
 	}
 
 	/*
@@ -449,9 +448,8 @@ xfs_iomap_write_direct(
 	 */
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-	resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
 	error = xfs_trans_reserve(tp, resblks,
-			XFS_WRITE_LOG_RES(mp), numrtextents,
+			XFS_WRITE_LOG_RES(mp), resrtextents,
 			XFS_TRANS_PERM_LOG_RES,
 			XFS_WRITE_LOG_COUNT);
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1cd2ac16387..54a6f114240 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -159,11 +159,15 @@ xfs_buftarg_t *xlog_target;
 void
 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 {
-	if (! log->l_grant_trace) {
-		log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP);
-		if (! log->l_grant_trace)
+	unsigned long cnts;
+
+	if (!log->l_grant_trace) {
+		log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
+		if (!log->l_grant_trace)
 			return;
 	}
+	/* ticket counts are 1 byte each */
+	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
 
 	ktrace_enter(log->l_grant_trace,
 		     (void *)tic,
@@ -178,10 +182,10 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 		     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
 		     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
 		     (void *)string,
-		     (void *)((unsigned long)13),
-		     (void *)((unsigned long)14),
-		     (void *)((unsigned long)15),
-		     (void *)((unsigned long)16));
+		     (void *)((unsigned long)tic->t_trans_type),
+		     (void *)cnts,
+		     (void *)((unsigned long)tic->t_curr_res),
+		     (void *)((unsigned long)tic->t_unit_res));
 }
 
 void
@@ -274,9 +278,11 @@ xfs_log_done(xfs_mount_t	*mp,
 		 * Release ticket if not permanent reservation or a specifc
 		 * request has been made to release a permanent reservation.
 		 */
+		xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
 		xlog_ungrant_log_space(log, ticket);
 		xlog_state_put_ticket(log, ticket);
 	} else {
+		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
 		xlog_regrant_reserve_log_space(log, ticket);
 	}
 
@@ -399,7 +405,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 		int		 cnt,
 		xfs_log_ticket_t *ticket,
 		__uint8_t	 client,
-		uint		 flags)
+		uint		 flags,
+		uint		 t_type)
 {
 	xlog_t		*log = mp->m_log;
 	xlog_ticket_t	*internal_ticket;
@@ -421,13 +428,19 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 	if (*ticket != NULL) {
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
 		internal_ticket = (xlog_ticket_t *)*ticket;
+		xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)");
 		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
 		retval = xlog_regrant_write_log_space(log, internal_ticket);
 	} else {
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
 						  client, flags);
+		internal_ticket->t_trans_type = t_type;
 		*ticket = internal_ticket;
+		xlog_trace_loggrant(log, internal_ticket, 
+			(internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ?
+			"xfs_log_reserve: create new ticket (permanent trans)" :
+			"xfs_log_reserve: create new ticket");
 		xlog_grant_push_ail(mp,
 				    (internal_ticket->t_unit_res *
 				     internal_ticket->t_cnt));
@@ -601,8 +614,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
 		reg[0].i_addr = (void*)&magic;
 		reg[0].i_len  = sizeof(magic);
+		XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
 
-		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0);
 		if (!error) {
 			/* remove inited flag */
 			((xlog_ticket_t *)tic)->t_flags = 0;
@@ -1272,6 +1286,7 @@ xlog_commit_record(xfs_mount_t  *mp,
 
 	reg[0].i_addr = NULL;
 	reg[0].i_len = 0;
+	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);
 
 	ASSERT_ALWAYS(iclog);
 	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1605,6 +1620,117 @@ xlog_state_finish_copy(xlog_t		*log,
 
 
 /*
+ * print out info relating to regions written which consume
+ * the reservation
+ */
+#if defined(XFS_LOG_RES_DEBUG)
+STATIC void
+xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+{
+	uint i;
+	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
+
+	/* match with XLOG_REG_TYPE_* in xfs_log.h */
+	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
+	    "bformat",
+	    "bchunk",
+	    "efi_format",
+	    "efd_format",
+	    "iformat",
+	    "icore",
+	    "iext",
+	    "ibroot",
+	    "ilocal",
+	    "iattr_ext",
+	    "iattr_broot",
+	    "iattr_local",
+	    "qformat",
+	    "dquot",
+	    "quotaoff",
+	    "LR header",
+	    "unmount",
+	    "commit",
+	    "trans header"
+	};
+	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
+	    "SETATTR_NOT_SIZE",
+	    "SETATTR_SIZE",
+	    "INACTIVE",
+	    "CREATE",
+	    "CREATE_TRUNC",
+	    "TRUNCATE_FILE",
+	    "REMOVE",
+	    "LINK",
+	    "RENAME",
+	    "MKDIR",
+	    "RMDIR",
+	    "SYMLINK",
+	    "SET_DMATTRS",
+	    "GROWFS",
+	    "STRAT_WRITE",
+	    "DIOSTRAT",
+	    "WRITE_SYNC",
+	    "WRITEID",
+	    "ADDAFORK",
+	    "ATTRINVAL",
+	    "ATRUNCATE",
+	    "ATTR_SET",
+	    "ATTR_RM",
+	    "ATTR_FLAG",
+	    "CLEAR_AGI_BUCKET",
+	    "QM_SBCHANGE",
+	    "DUMMY1",
+	    "DUMMY2",
+	    "QM_QUOTAOFF",
+	    "QM_DQALLOC",
+	    "QM_SETQLIM",
+	    "QM_DQCLUSTER",
+	    "QM_QINOCREATE",
+	    "QM_QUOTAOFF_END",
+	    "SB_UNIT",
+	    "FSYNC_TS",
+	    "GROWFSRT_ALLOC",
+	    "GROWFSRT_ZERO",
+	    "GROWFSRT_FREE",
+	    "SWAPEXT"
+	};
+
+	xfs_fs_cmn_err(CE_WARN, mp,
+			"xfs_log_write: reservation summary:\n"
+			"  trans type  = %s (%u)\n"
+			"  unit res    = %d bytes\n"
+			"  current res = %d bytes\n"
+			"  total reg   = %u bytes (o/flow = %u bytes)\n"
+			"  ophdrs      = %u (ophdr space = %u bytes)\n"
+			"  ophdr + reg = %u bytes\n"
+			"  num regions = %u\n",
+			((ticket->t_trans_type <= 0 ||
+			  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+			  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+			ticket->t_trans_type,
+			ticket->t_unit_res,
+			ticket->t_curr_res,
+			ticket->t_res_arr_sum, ticket->t_res_o_flow,
+			ticket->t_res_num_ophdrs, ophdr_spc,
+			ticket->t_res_arr_sum + 
+			  ticket->t_res_o_flow + ophdr_spc,
+			ticket->t_res_num);
+
+	for (i = 0; i < ticket->t_res_num; i++) {
+	   	uint r_type = ticket->t_res_arr[i].r_type; 
+		cmn_err(CE_WARN,
+			    "region[%u]: %s - %u bytes\n",
+			    i, 
+			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
+			    "bad-rtype" : res_type_str[r_type-1]),
+			    ticket->t_res_arr[i].r_len);
+	}
+}
+#else
+#define xlog_print_tic_res(mp, ticket)
+#endif
+
+/*
  * Write some region out to in-core log
  *
  * This will be called when writing externally provided regions or when
@@ -1677,16 +1803,21 @@ xlog_write(xfs_mount_t *	mp,
      * xlog_op_header_t and may need to be double word aligned.
      */
     len = 0;
-    if (ticket->t_flags & XLOG_TIC_INITED)     /* acct for start rec of xact */
+    if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
 	len += sizeof(xlog_op_header_t);
+	XLOG_TIC_ADD_OPHDR(ticket);
+    }
 
     for (index = 0; index < nentries; index++) {
 	len += sizeof(xlog_op_header_t);	    /* each region gets >= 1 */
+	XLOG_TIC_ADD_OPHDR(ticket);
 	len += reg[index].i_len;
+	XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type);
     }
     contwr = *start_lsn = 0;
 
     if (ticket->t_curr_res < len) {
+	xlog_print_tic_res(mp, ticket);
 #ifdef DEBUG
 	xlog_panic(
 		"xfs_log_write: reservation ran out. Need to up reservation");
@@ -1790,6 +1921,7 @@ xlog_write(xfs_mount_t *	mp,
 		len += sizeof(xlog_op_header_t); /* from splitting of region */
 		/* account for new log op header */
 		ticket->t_curr_res -= sizeof(xlog_op_header_t);
+		XLOG_TIC_ADD_OPHDR(ticket);
 	    }
 	    xlog_verify_dest_ptr(log, ptr);
 
@@ -2282,6 +2414,9 @@ restart:
 	 */
 	if (log_offset == 0) {
 		ticket->t_curr_res -= log->l_iclog_hsize;
+		XLOG_TIC_ADD_REGION(ticket,
+				    log->l_iclog_hsize,
+				    XLOG_REG_TYPE_LRHEADER);
 		INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
 		ASSIGN_LSN(head->h_lsn, log);
 		ASSERT(log->l_curr_block >= 0);
@@ -2468,6 +2603,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 #endif
 
 	tic->t_curr_res = tic->t_unit_res;
+	XLOG_TIC_RESET_RES(tic);
 
 	if (tic->t_cnt > 0)
 		return (0);
@@ -2608,6 +2744,7 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
 	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
 	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
 	ticket->t_curr_res = ticket->t_unit_res;
+	XLOG_TIC_RESET_RES(ticket);
 	xlog_trace_loggrant(log, ticket,
 			    "xlog_regrant_reserve_log_space: sub current res");
 	xlog_verify_grant_head(log, 1);
@@ -2624,6 +2761,7 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
 	xlog_verify_grant_head(log, 0);
 	GRANT_UNLOCK(log, s);
 	ticket->t_curr_res = ticket->t_unit_res;
+	XLOG_TIC_RESET_RES(ticket);
 }	/* xlog_regrant_reserve_log_space */
 
 
@@ -3179,29 +3317,57 @@ xlog_ticket_get(xlog_t		*log,
 	 * and their unit amount is the total amount of space required.
 	 *
 	 * The following lines of code account for non-transaction data
-	 * which occupy space in the on-disk log. 
+	 * which occupy space in the on-disk log.
+	 *
+	 * Normal form of a transaction is:
+	 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
+	 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
+	 *
+	 * We need to account for all the leadup data and trailer data
+	 * around the transaction data.
+	 * And then we need to account for the worst case in terms of using
+	 * more space.
+	 * The worst case will happen if:
+	 * - the placement of the transaction happens to be such that the
+	 *   roundoff is at its maximum
+	 * - the transaction data is synced before the commit record is synced
+	 *   i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
+	 *   Therefore the commit record is in its own Log Record.
+	 *   This can happen as the commit record is called with its
+	 *   own region to xlog_write().
+	 *   This then means that in the worst case, roundoff can happen for
+	 *   the commit-rec as well.
+	 *   The commit-rec is smaller than padding in this scenario and so it is
+	 *   not added separately.
 	 */
 
+	/* for trans header */
+	unit_bytes += sizeof(xlog_op_header_t);
+	unit_bytes += sizeof(xfs_trans_header_t);
+
 	/* for start-rec */
-	unit_bytes += sizeof(xlog_op_header_t); 
+	unit_bytes += sizeof(xlog_op_header_t);
+
+	/* for LR headers */
+	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+	unit_bytes += log->l_iclog_hsize * num_headers;
+
+	/* for commit-rec LR header - note: padding will subsume the ophdr */
+	unit_bytes += log->l_iclog_hsize;
+
+	/* for split-recs - ophdrs added when data split over LRs */
+	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
 
-	/* for padding */
+	/* for roundoff padding for transaction data and one for commit record */
 	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
-		log->l_mp->m_sb.sb_logsunit > 1) {
+	    log->l_mp->m_sb.sb_logsunit > 1) {
 		/* log su roundoff */
-		unit_bytes += log->l_mp->m_sb.sb_logsunit;  
+		unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
 	} else {
 		/* BB roundoff */
-		unit_bytes += BBSIZE;
+		unit_bytes += 2*BBSIZE;
         }
 
-	/* for commit-rec */
-	unit_bytes += sizeof(xlog_op_header_t);
- 
-	/* for LR headers */
-	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
-	unit_bytes += log->l_iclog_hsize * num_headers;
-
 	tic->t_unit_res		= unit_bytes;
 	tic->t_curr_res		= unit_bytes;
 	tic->t_cnt		= cnt;
@@ -3209,10 +3375,13 @@ xlog_ticket_get(xlog_t		*log,
 	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
+	tic->t_trans_type	= 0;
 	if (xflags & XFS_LOG_PERM_RESERV)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
 	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
 
+	XLOG_TIC_RESET_RES(tic);
+
 	return tic;
 }	/* xlog_ticket_get */
 
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 0db122ddda3..18961119fc6 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -114,9 +114,44 @@ xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XFS_VOLUME		0x2
 #define XFS_LOG			0xaa
 
+
+/* Region types for iovec's i_type */
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_REG_TYPE_BFORMAT		1
+#define XLOG_REG_TYPE_BCHUNK		2
+#define XLOG_REG_TYPE_EFI_FORMAT	3
+#define XLOG_REG_TYPE_EFD_FORMAT	4
+#define XLOG_REG_TYPE_IFORMAT		5
+#define XLOG_REG_TYPE_ICORE		6
+#define XLOG_REG_TYPE_IEXT		7
+#define XLOG_REG_TYPE_IBROOT		8
+#define XLOG_REG_TYPE_ILOCAL		9
+#define XLOG_REG_TYPE_IATTR_EXT		10
+#define XLOG_REG_TYPE_IATTR_BROOT	11
+#define XLOG_REG_TYPE_IATTR_LOCAL	12
+#define XLOG_REG_TYPE_QFORMAT		13
+#define XLOG_REG_TYPE_DQUOT		14
+#define XLOG_REG_TYPE_QUOTAOFF		15
+#define XLOG_REG_TYPE_LRHEADER		16
+#define XLOG_REG_TYPE_UNMOUNT		17
+#define XLOG_REG_TYPE_COMMIT		18
+#define XLOG_REG_TYPE_TRANSHDR		19
+#define XLOG_REG_TYPE_MAX		19
+#endif
+
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
+#else
+#define XLOG_VEC_SET_TYPE(vecp, t)
+#endif
+
+
 typedef struct xfs_log_iovec {
 	xfs_caddr_t		i_addr;		/* beginning address of region */
 	int		i_len;		/* length in bytes of region */
+#if defined(XFS_LOG_RES_DEBUG)
+ 	uint		i_type;		/* type of region */
+#endif
 } xfs_log_iovec_t;
 
 typedef void* xfs_log_ticket_t;
@@ -159,7 +194,8 @@ int	  xfs_log_reserve(struct xfs_mount *mp,
 			  int		   count,
 			  xfs_log_ticket_t *ticket,
 			  __uint8_t	   clientid,
-			  uint		   flags);
+			  uint		   flags,
+			  uint		   t_type);
 int	  xfs_log_write(struct xfs_mount *mp,
 			xfs_log_iovec_t  region[],
 			int		 nentries,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1a1d452f15f..a884cea82fc 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -112,7 +112,7 @@ struct xfs_mount;
  * this has endian issues, of course.
  */
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#ifndef XFS_NATIVE_HOST
 #define GET_CLIENT_ID(i,arch) \
     ((i) & 0xff)
 #else
@@ -335,18 +335,66 @@ typedef __uint32_t xlog_tid_t;
 
 #define XLOG_COVER_OPS		5
 
+
+/* Ticket reservation region accounting */ 
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_TIC_LEN_MAX	15
+#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \
+				(t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0)
+#define XLOG_TIC_ADD_OPHDR(t) ((t)->t_res_num_ophdrs++)
+#define XLOG_TIC_ADD_REGION(t, len, type)				\
+	do {								\
+		if ((t)->t_res_num == XLOG_TIC_LEN_MAX) { 		\
+			/* add to overflow and start again */		\
+			(t)->t_res_o_flow += (t)->t_res_arr_sum;	\
+			(t)->t_res_num = 0;				\
+			(t)->t_res_arr_sum = 0;				\
+		}							\
+		(t)->t_res_arr[(t)->t_res_num].r_len = (len);		\
+		(t)->t_res_arr[(t)->t_res_num].r_type = (type);		\
+		(t)->t_res_arr_sum += (len);				\
+		(t)->t_res_num++;					\
+	} while (0)
+
+/*
+ * Reservation region
+ * As would be stored in xfs_log_iovec but without the i_addr which
+ * we don't care about.
+ */
+typedef struct xlog_res {
+	uint	r_len;
+	uint	r_type;
+} xlog_res_t;
+#else
+#define XLOG_TIC_RESET_RES(t)
+#define XLOG_TIC_ADD_OPHDR(t)
+#define XLOG_TIC_ADD_REGION(t, len, type)
+#endif
+
+
 typedef struct xlog_ticket {
-	sv_t		   t_sema;	 /* sleep on this semaphore	 :20 */
-	struct xlog_ticket *t_next;	 /*			         : 4 */
-	struct xlog_ticket *t_prev;	 /*				 : 4 */
-	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4 */
-	int		   t_curr_res;	 /* current reservation in bytes : 4 */
-	int		   t_unit_res;	 /* unit reservation in bytes    : 4 */
-	__uint8_t	   t_ocnt;	 /* original count		 : 1 */
-	__uint8_t	   t_cnt;	 /* current count		 : 1 */
-	__uint8_t	   t_clientid;	 /* who does this belong to;	 : 1 */
-	__uint8_t	   t_flags;	 /* properties of reservation	 : 1 */
+	sv_t		   t_sema;	 /* sleep on this semaphore      : 20 */
+ 	struct xlog_ticket *t_next;	 /*			         :4|8 */
+	struct xlog_ticket *t_prev;	 /*				 :4|8 */
+	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
+	int		   t_curr_res;	 /* current reservation in bytes : 4  */
+	int		   t_unit_res;	 /* unit reservation in bytes    : 4  */
+	char		   t_ocnt;	 /* original count		 : 1  */
+	char		   t_cnt;	 /* current count		 : 1  */
+	char		   t_clientid;	 /* who does this belong to;	 : 1  */
+	char		   t_flags;	 /* properties of reservation	 : 1  */
+	uint		   t_trans_type; /* transaction type             : 4  */
+
+#if defined (XFS_LOG_RES_DEBUG)
+        /* reservation array fields */
+	uint		   t_res_num;                    /* num in array : 4 */
+	xlog_res_t	   t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : X */ 
+	uint		   t_res_num_ophdrs;		 /* num op hdrs  : 4 */
+	uint		   t_res_arr_sum;		 /* array sum    : 4 */
+	uint		   t_res_o_flow;		 /* sum overflow : 4 */
+#endif
 } xlog_ticket_t;
+
 #endif
 
 
@@ -366,14 +414,10 @@ typedef struct xlog_op_header {
 #define XLOG_FMT_IRIX_BE  3
 
 /* our fmt */
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#else
-#if __BYTE_ORDER == __BIG_ENDIAN
+#ifdef XFS_NATIVE_HOST
 #define XLOG_FMT XLOG_FMT_LINUX_BE
 #else
-#error unknown byte order
-#endif
+#define XLOG_FMT XLOG_FMT_LINUX_LE
 #endif
 
 typedef struct xlog_rec_header {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0aac28ddb81..14faabaabf2 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1387,7 +1387,7 @@ xlog_recover_add_to_cont_trans(
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
 	memcpy(&ptr[old_len], dp, len); /* d, s, l */
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 4f40c92863d..a6cd6324e94 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -42,7 +42,8 @@
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
-
+#include "xfs_quota.h"
+#include "xfs_error.h"
 
 STATIC struct xfs_dquot *
 xfs_dqvopchown_default(
@@ -54,8 +55,79 @@ xfs_dqvopchown_default(
 	return NULL;
 }
 
+/*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(xfs_mount_t *mp)
+{
+	int			error;
+	xfs_trans_t		*tp;
+	unsigned long		s;
+
+	mp->m_qflags = 0;
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without SB_LOCK.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return 0;
+	s = XFS_SB_LOCK(mp);
+	mp->m_sb.sb_qflags = 0;
+	XFS_SB_UNLOCK(mp, s);
+
+	/*
+	 * if the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return 0;
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				      XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_mount_reset_sbqflags: Superblock update failed!");
+		return error;
+	}
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	error = xfs_trans_commit(tp, 0, NULL);
+	return error;
+}
+
+STATIC int
+xfs_noquota_init(
+	xfs_mount_t	*mp,
+	uint		*needquotamount,
+	uint		*quotaflags)
+{
+	int		error = 0;
+
+	*quotaflags = 0;
+	*needquotamount = B_FALSE;
+
+	ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+	/*
+	 * If a file system had quotas running earlier, but decided to
+	 * mount without -o uquota/pquota/gquota options, revoke the
+	 * quotachecked license.
+	 */
+	if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+		cmn_err(CE_NOTE,
+                        "XFS resetting qflags for filesystem %s",
+                        mp->m_fsname);
+
+		error = xfs_mount_reset_sbqflags(mp);
+	}
+	return error;
+}
+
 xfs_qmops_t	xfs_qmcore_stub = {
-	.xfs_qminit		= (xfs_qminit_t) fs_noerr,
+	.xfs_qminit		= (xfs_qminit_t) xfs_noquota_init,
 	.xfs_qmdone		= (xfs_qmdone_t) fs_noerr,
 	.xfs_qmmount		= (xfs_qmmount_t) fs_noerr,
 	.xfs_qmunmount		= (xfs_qmunmount_t) fs_noerr,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 7134576ae7f..32cb79752d5 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -160,6 +160,20 @@ typedef struct xfs_qoff_logformat {
 #define XFS_GQUOTA_ACCT	0x0040  /* group quota accounting ON */
 
 /*
+ * Quota Accounting/Enforcement flags
+ */
+#define XFS_ALL_QUOTA_ACCT	\
+		(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD	(XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+
+#define XFS_IS_QUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_QUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
+#define XFS_IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_GQUOTA_ACCT)
+
+/*
  * Incore only flags for quotaoff - these bits get cleared when quota(s)
  * are in the process of getting turned off. These flags are in m_qflags but
  * never in sb_qflags.
@@ -362,6 +376,7 @@ typedef struct xfs_dqtrxops {
 				f | XFS_QMOPT_RES_REGBLKS)
 
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
+extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
 extern struct bhv_vfsops xfs_qmops;
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06dfca531f7..92efe272b83 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -276,7 +276,7 @@ xfs_trans_reserve(
 
 		error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
 					&tp->t_ticket,
-					XFS_TRANSACTION, log_flags);
+					XFS_TRANSACTION, log_flags, tp->t_type);
 		if (error) {
 			goto undo_blocks;
 		}
@@ -1032,6 +1032,7 @@ xfs_trans_fill_vecs(
 	tp->t_header.th_num_items = nitems;
 	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
 	log_vector->i_len = sizeof(xfs_trans_header_t);
+	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
 }
 
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ec541d66fa2..a263aec8b3a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -112,6 +112,7 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_GROWFSRT_ZERO		38
 #define	XFS_TRANS_GROWFSRT_FREE		39
 #define	XFS_TRANS_SWAPEXT		40
+#define	XFS_TRANS_TYPE_MAX		40
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 
@@ -998,6 +999,7 @@ struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
 void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 7bc5eab4c2c..2a71b4f91bf 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -379,8 +379,8 @@ xfs_trans_delete_ail(
 		else {
 			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
 				"xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
-			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
 			AIL_UNLOCK(mp, s);
+			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
 		}
 	}
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 144da7a8546..e733293dd7f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -714,6 +714,29 @@ xfs_trans_bhold(xfs_trans_t	*tp,
 }
 
 /*
+ * Cancel the previous buffer hold request made on this buffer
+ * for this transaction.
+ */
+void
+xfs_trans_bhold_release(xfs_trans_t	*tp,
+			xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
+	bip->bli_flags &= ~XFS_BLI_HOLD;
+	xfs_buf_item_trace("BHOLD RELEASE", bip);
+}
+
+/*
  * This is called to mark bytes first through last inclusive of the given
  * buffer as needing to be logged when the transaction is committed.
  * The buffer must already be associated with the given transaction.
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 42bcc021520..f1a904e23ad 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -795,7 +795,6 @@ xfs_statvfs(
 	xfs_mount_t	*mp;
 	xfs_sb_t	*sbp;
 	unsigned long	s;
-	u64 id;
 
 	mp = XFS_BHVTOM(bdp);
 	sbp = &(mp->m_sb);
@@ -823,9 +822,7 @@ xfs_statvfs(
 	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	XFS_SB_UNLOCK(mp, s);
 
-	id = huge_encode_dev(mp->m_dev);
-	statp->f_fsid.val[0] = (u32)id;
-	statp->f_fsid.val[1] = (u32)(id >> 32);
+	xfs_statvfs_fsid(statp, mp);
 	statp->f_namelen = MAXNAMELEN - 1;
 
 	return 0;
@@ -906,7 +903,6 @@ xfs_sync_inodes(
 	xfs_inode_t	*ip_next;
 	xfs_buf_t	*bp;
 	vnode_t		*vp = NULL;
-	vmap_t		vmap;
 	int		error;
 	int		last_error;
 	uint64_t	fflag;
@@ -1101,48 +1097,21 @@ xfs_sync_inodes(
 		 * lock in xfs_ireclaim() after the inode is pulled from
 		 * the mount list will sleep until we release it here.
 		 * This keeps the vnode from being freed while we reference
-		 * it.  It is also cheaper and simpler than actually doing
-		 * a vn_get() for every inode we touch here.
+		 * it.
 		 */
 		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
-
 			if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
 				ip = ip->i_mnext;
 				continue;
 			}
 
-			/*
-			 * We need to unlock the inode list lock in order
-			 * to lock the inode. Insert a marker record into
-			 * the inode list to remember our position, dropping
-			 * the lock is now done inside the IPOINTER_INSERT
-			 * macro.
-			 *
-			 * We also use the inode list lock to protect us
-			 * in taking a snapshot of the vnode version number
-			 * for use in calling vn_get().
-			 */
-			VMAP(vp, vmap);
-			IPOINTER_INSERT(ip, mp);
-
-			vp = vn_get(vp, &vmap);
+			vp = vn_grab(vp);
 			if (vp == NULL) {
-				/*
-				 * The vnode was reclaimed once we let go
-				 * of the inode list lock.  Skip to the
-				 * next list entry. Remove the marker.
-				 */
-
-				XFS_MOUNT_ILOCK(mp);
-
-				mount_locked = B_TRUE;
-				vnode_refed  = B_FALSE;
-
-				IPOINTER_REMOVE(ip, mp);
-
+				ip = ip->i_mnext;
 				continue;
 			}
 
+			IPOINTER_INSERT(ip, mp);
 			xfs_ilock(ip, lock_flags);
 
 			ASSERT(vp == XFS_ITOV(ip));
@@ -1533,7 +1502,10 @@ xfs_syncsub(
 	 * eventually kicked out of the cache.
 	 */
 	if (flags & SYNC_REFCACHE) {
-		xfs_refcache_purge_some(mp);
+		if (flags & SYNC_WAIT)
+			xfs_refcache_purge_mp(mp);
+		else
+			xfs_refcache_purge_some(mp);
 	}
 
 	/*
@@ -1649,6 +1621,10 @@ xfs_vget(
 #define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
 #define MNTOPT_NOUUID	"nouuid"	/* ignore filesystem UUID */
 #define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
+#define MNTOPT_GRPID	"grpid"		/* group-ID from parent directory */
+#define MNTOPT_NOGRPID	"nogrpid"	/* group-ID from current process */
+#define MNTOPT_BSDGROUPS    "bsdgroups"    /* group-ID from parent directory */
+#define MNTOPT_SYSVGROUPS   "sysvgroups"   /* group-ID from current process */
 #define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
 #define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
@@ -1769,6 +1745,12 @@ xfs_parseargs(
 			}
 			args->flags |= XFSMNT_IHASHSIZE;
 			args->ihashsize = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
+			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+			vfsp->vfs_flag |= VFS_GRPID;
+		} else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
+			   !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+			vfsp->vfs_flag &= ~VFS_GRPID;
 		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
 			args->flags |= XFSMNT_WSYNC;
 		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
@@ -1890,6 +1872,7 @@ xfs_showargs(
 	};
 	struct proc_xfs_info	*xfs_infop;
 	struct xfs_mount	*mp = XFS_BHVTOM(bhv);
+	struct vfs		*vfsp = XFS_MTOVFS(mp);
 
 	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
 		if (mp->m_flags & xfs_infop->flag)
@@ -1926,7 +1909,10 @@ xfs_showargs(
 
 	if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT))
 		seq_printf(m, "," MNTOPT_64BITINODE);
-	
+
+	if (vfsp->vfs_flag & VFS_GRPID)
+		seq_printf(m, "," MNTOPT_GRPID);
+
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1377c868f3f..58bfe629b93 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -104,7 +104,7 @@ xfs_open(
 	 * If it's a directory with any blocks, read-ahead block 0
 	 * as we're almost certain to have the next operation be a read there.
 	 */
-	if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
+	if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
 		mode = xfs_ilock_map_shared(ip);
 		if (ip->i_d.di_nextents > 0)
 			(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
@@ -163,18 +163,21 @@ xfs_getattr(
 	/*
 	 * Copy from in-core inode.
 	 */
-	vap->va_type = vp->v_type;
-	vap->va_mode = ip->i_d.di_mode & MODEMASK;
+	vap->va_mode = ip->i_d.di_mode;
 	vap->va_uid = ip->i_d.di_uid;
 	vap->va_gid = ip->i_d.di_gid;
 	vap->va_projid = ip->i_d.di_projid;
 
 	/*
 	 * Check vnode type block/char vs. everything else.
-	 * Do it with bitmask because that's faster than looking
-	 * for multiple values individually.
 	 */
-	if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		vap->va_rdev = ip->i_df.if_u2.if_rdev;
+		vap->va_blocksize = BLKDEV_IOSIZE;
+		break;
+	default:
 		vap->va_rdev = 0;
 
 		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
@@ -224,9 +227,7 @@ xfs_getattr(
 				(ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
 				(mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
 		}
-	} else {
-		vap->va_rdev = ip->i_df.if_u2.if_rdev;
-		vap->va_blocksize = BLKDEV_IOSIZE;
+		break;
 	}
 
 	vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
@@ -468,7 +469,7 @@ xfs_setattr(
 				m |= S_ISGID;
 #if 0
 			/* Linux allows this, Irix doesn't. */
-			if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR)
+			if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 				m |= S_ISVTX;
 #endif
 			if (m && !capable(CAP_FSETID))
@@ -546,10 +547,10 @@ xfs_setattr(
 			goto error_return;
 		}
 
-		if (vp->v_type == VDIR) {
+		if (VN_ISDIR(vp)) {
 			code = XFS_ERROR(EISDIR);
 			goto error_return;
-		} else if (vp->v_type != VREG) {
+		} else if (!VN_ISREG(vp)) {
 			code = XFS_ERROR(EINVAL);
 			goto error_return;
 		}
@@ -1567,7 +1568,7 @@ xfs_release(
 	vp = BHV_TO_VNODE(bdp);
 	ip = XFS_BHVTOI(bdp);
 
-	if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
+	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
 		return 0;
 	}
 
@@ -1895,7 +1896,7 @@ xfs_create(
 	dp = XFS_BHVTOI(dir_bdp);
 	mp = dp->i_mount;
 
-	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+	dm_di_mode = vap->va_mode;
 	namelen = VNAMELEN(dentry);
 
 	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
@@ -1973,8 +1974,7 @@ xfs_create(
 	    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
 		goto error_return;
 	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
-	error = xfs_dir_ialloc(&tp, dp,
-			MAKEIMODE(vap->va_type,vap->va_mode), 1,
+	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
 			rdev, credp, prid, resblks > 0,
 			&ip, &committed);
 	if (error) {
@@ -2620,7 +2620,7 @@ xfs_link(
 	vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
 
 	target_namelen = VNAMELEN(dentry);
-	if (src_vp->v_type == VDIR)
+	if (VN_ISDIR(src_vp))
 		return XFS_ERROR(EPERM);
 
 	src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
@@ -2805,7 +2805,7 @@ xfs_mkdir(
 
 	tp = NULL;
 	dp_joined_to_trans = B_FALSE;
-	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+	dm_di_mode = vap->va_mode;
 
 	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
@@ -2879,8 +2879,7 @@ xfs_mkdir(
 	/*
 	 * create the directory inode.
 	 */
-	error = xfs_dir_ialloc(&tp, dp,
-			MAKEIMODE(vap->va_type,vap->va_mode), 2,
+	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
 			0, credp, prid, resblks > 0,
 		&cdp, NULL);
 	if (error) {
@@ -3650,7 +3649,7 @@ xfs_rwlock(
 	vnode_t		*vp;
 
 	vp = BHV_TO_VNODE(bdp);
-	if (vp->v_type == VDIR)
+	if (VN_ISDIR(vp))
 		return 1;
 	ip = XFS_BHVTOI(bdp);
 	if (locktype == VRWLOCK_WRITE) {
@@ -3681,7 +3680,7 @@ xfs_rwunlock(
 	vnode_t		*vp;
 
 	vp = BHV_TO_VNODE(bdp);
-	if (vp->v_type == VDIR)
+	if (VN_ISDIR(vp))
 		return;
 	ip = XFS_BHVTOI(bdp);
 	if (locktype == VRWLOCK_WRITE) {
@@ -3847,51 +3846,10 @@ xfs_reclaim(
 		return 0;
 	}
 
-	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
-		if (ip->i_d.di_size > 0) {
-			/*
-			 * Flush and invalidate any data left around that is
-			 * a part of this file.
-			 *
-			 * Get the inode's i/o lock so that buffers are pushed
-			 * out while holding the proper lock.  We can't hold
-			 * the inode lock here since flushing out buffers may
-			 * cause us to try to get the lock in xfs_strategy().
-			 *
-			 * We don't have to call remapf() here, because there
-			 * cannot be any mapped file references to this vnode
-			 * since it is being reclaimed.
-			 */
-			xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-			/*
-			 * If we hit an IO error, we need to make sure that the
-			 * buffer and page caches of file data for
-			 * the file are tossed away. We don't want to use
-			 * VOP_FLUSHINVAL_PAGES here because we don't want dirty
-			 * pages to stay attached to the vnode, but be
-			 * marked P_BAD. pdflush/vnode_pagebad
-			 * hates that.
-			 */
-			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
-			} else {
-				VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
-			}
+	vn_iowait(vp);
 
-			ASSERT(VN_CACHED(vp) == 0);
-			ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
-			       ip->i_delayed_blks == 0);
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		} else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-			/*
-			 * di_size field may not be quite accurate if we're
-			 * shutting down.
-			 */
-			VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
-			ASSERT(VN_CACHED(vp) == 0);
-		}
-	}
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+	ASSERT(VN_CACHED(vp) == 0);
 
 	/* If we have nothing to flush with this inode then complete the
 	 * teardown now, otherwise break the link between the xfs inode
@@ -4567,7 +4525,7 @@ xfs_change_file_space(
 	/*
 	 * must be a regular file and have write permission
 	 */
-	if (vp->v_type != VREG)
+	if (!VN_ISREG(vp))
 		return XFS_ERROR(EINVAL);
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);