Diffstat (limited to 'fs')
209 files changed, 6402 insertions, 2947 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 2e43d46f65d..cf12c403b8c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1005,7 +1005,8 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ + (S390 && 64BIT) || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 853845abcca..55e8ee1900a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -41,7 +41,7 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. config BINFMT_FLAT - tristate "Kernel support for flat binaries" + bool "Kernel support for flat binaries" depends on !MMU help Support uClinux FLAT format binaries. diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 936f2af39c4..831157502d5 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -75,7 +75,7 @@ extern unsigned int adfs_map_free(struct super_block *sb); /* Misc */ void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define adfs_error(sb, fmt...) __adfs_error(sb, __FUNCTION__, fmt) +#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) /* super.c */ diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index b9b2b27b68c..ea7df214692 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -122,9 +122,9 @@ adfs_dir_checkbyte(const struct adfs_dir *dir) ptr.ptr8 = bufoff(bh, i); end.ptr8 = ptr.ptr8 + last - i; - do + do { dircheck = *ptr.ptr8++ ^ ror13(dircheck); - while (ptr.ptr8 < end.ptr8); + } while (ptr.ptr8 < end.ptr8); } /* diff --git a/fs/affs/file.c b/fs/affs/file.c index 6e0c9399200..1a4f092f24e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -325,8 +325,7 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); - if (block > (sector_t)0x7fffffffUL) - BUG(); + BUG_ON(block > (sector_t)0x7fffffffUL); if (block >= AFFS_I(inode)->i_blkcnt) { if (block > AFFS_I(inode)->i_blkcnt || !create) @@ -493,8 +492,7 @@ affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsign u32 tmp; pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); - if (from > to || to > PAGE_CACHE_SIZE) - BUG(); + BUG_ON(from > to || to > PAGE_CACHE_SIZE); kmap(page); data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; @@ -507,8 +505,7 @@ affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsign if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - from); - if (from + tmp > to || tmp > bsize) - BUG(); + BUG_ON(from + tmp > to || tmp > bsize); memcpy(data + from, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; @@ -540,10 +537,9 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, newsize - size); - if (boff + tmp > bsize || tmp > bsize) - BUG(); + BUG_ON(boff + tmp > bsize || tmp > bsize); memset(AFFS_DATA(bh) + boff, 0, tmp); - AFFS_DATA_HEAD(bh)->size = cpu_to_be32(be32_to_cpu(AFFS_DATA_HEAD(bh)->size) + tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); size += tmp; @@ -560,8 +556,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) if (IS_ERR(bh)) goto out; tmp = min(bsize, newsize - 
size); - if (tmp > bsize) - BUG(); + BUG_ON(tmp > bsize); AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); @@ -683,10 +678,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - from); - if (boff + tmp > bsize || tmp > bsize) - BUG(); + BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); - AFFS_DATA_HEAD(bh)->size = cpu_to_be32(be32_to_cpu(AFFS_DATA_HEAD(bh)->size) + tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); written += tmp; @@ -732,8 +726,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (IS_ERR(bh)) goto out; tmp = min(bsize, to - from); - if (tmp > bsize) - BUG(); + BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); diff --git a/fs/affs/super.c b/fs/affs/super.c index d2dc047cb47..01d25d53254 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -199,7 +199,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s case Opt_prefix: /* Free any previous prefix */ kfree(*prefix); - *prefix = NULL; *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -233,6 +232,8 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_volume: { char *vol = match_strdup(&args[0]); + if (!vol) + return 0; strlcpy(volume, vol, 32); kfree(vol); break; diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h index 7b4d4fab4c8..255f5dd6040 100644 --- a/fs/afs/afs_cm.h +++ b/fs/afs/afs_cm.h @@ -24,7 +24,8 @@ enum AFS_CM_Operations { CBGetXStatsVersion = 209, /* get version of extended statistics */ CBGetXStats = 210, /* get contents of extended statistics data */ CBInitCallBackState3 = 213, /* initialise callback state, version 3 */ - CBGetCapabilities = 65538, /* get client capabilities */ + CBProbeUuid = 214, /* check the client hasn't rebooted */ + CBTellMeAboutYourself = 65538, /* get client capabilities */ }; #define AFS_CAP_ERROR_TRANSLATION 0x1 diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 584bb0f9c36..5e1df14e16b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,7 +20,7 @@ DECLARE_RWSEM(afs_proc_cells_sem); LIST_HEAD(afs_proc_cells); -static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells); +static LIST_HEAD(afs_cells); static DEFINE_RWLOCK(afs_cells_lock); static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 47b71c8947f..eb765489164 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -26,8 +26,9 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *, struct sk_buff *, bool); static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *, - bool); +static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, + struct sk_buff *, bool); static void afs_cm_destructor(struct afs_call *); /* @@ -71,11 +72,21 @@ static const struct afs_call_type afs_SRXCBProbe = { }; /* - * CB.GetCapabilities operation 
type + * CB.ProbeUuid operation type */ -static const struct afs_call_type afs_SRXCBGetCapabilites = { - .name = "CB.GetCapabilities", - .deliver = afs_deliver_cb_get_capabilities, +static const struct afs_call_type afs_SRXCBProbeUuid = { + .name = "CB.ProbeUuid", + .deliver = afs_deliver_cb_probe_uuid, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.TellMeAboutYourself operation type + */ +static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { + .name = "CB.TellMeAboutYourself", + .deliver = afs_deliver_cb_tell_me_about_yourself, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, }; @@ -103,8 +114,8 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBProbe: call->type = &afs_SRXCBProbe; return true; - case CBGetCapabilities: - call->type = &afs_SRXCBGetCapabilites; + case CBTellMeAboutYourself: + call->type = &afs_SRXCBTellMeAboutYourself; return true; default: return false; @@ -393,9 +404,105 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, } /* + * allow the fileserver to quickly find out if the fileserver has been rebooted + */ +static void SRXAFSCB_ProbeUuid(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + struct afs_uuid *r = call->request; + + struct { + __be32 match; + } reply; + + _enter(""); + + + if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + reply.match = htonl(0); + else + reply.match = htonl(1); + + afs_send_simple_reply(call, &reply, sizeof(reply)); + _leave(""); +} + +/* + * deliver request data to a CB.ProbeUuid call + */ +static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, skb, last, call->buffer, + 11 * sizeof(__be32)); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); + schedule_work(&call->work); + return 0; +} + +/* * allow the fileserver to ask about the cache manager's capabilities */ -static void SRXAFSCB_GetCapabilities(struct work_struct *work) +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) { struct afs_interface *ifs; struct afs_call *call = container_of(work, struct afs_call, work); @@ -456,10 +563,10 @@ static void SRXAFSCB_GetCapabilities(struct work_struct *work) } /* - * deliver request data to a CB.GetCapabilities call + * deliver request data to a CB.TellMeAboutYourself call */ -static int 
afs_deliver_cb_get_capabilities(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, + struct sk_buff *skb, bool last) { _enter(",{%u},%d", skb->len, last); @@ -471,7 +578,7 @@ static int afs_deliver_cb_get_capabilities(struct afs_call *call, /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; - INIT_WORK(&call->work, SRXAFSCB_GetCapabilities); + INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); schedule_work(&call->work); return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b58af8f18bc..dfda03d4397 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -140,7 +140,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __FUNCTION__, dir->i_ino, qty, + __func__, dir->i_ino, qty, ntohs(dbuf->blocks[0].pagehdr.npages)); goto error; } @@ -159,7 +159,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", - __FUNCTION__, dir->i_ino, tmp, qty, + __func__, dir->i_ino, tmp, qty, ntohs(dbuf->blocks[tmp].pagehdr.magic)); goto error; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index eec41c76de7..7102824ba84 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -757,8 +757,8 @@ void _dbprintk(const char *fmt, ...) { } -#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) -#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) @@ -791,8 +791,8 @@ do { \ } while (0) #else -#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) -#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) 
_dbprintk(" "FMT ,##__VA_ARGS__) #endif diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 846c7615ac9..9f7d1ae7026 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,6 +41,7 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static int afs_proc_rootcell_open(struct inode *inode, struct file *file); @@ -56,7 +57,8 @@ static const struct file_operations afs_proc_rootcell_fops = { .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release + .release = afs_proc_rootcell_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); @@ -80,6 +82,7 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_volumes_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_vlservers_open(struct inode *inode, @@ -104,6 +107,7 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_vlservers_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); @@ -127,6 +131,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_servers_release, + .owner = THIS_MODULE, }; /* @@ -143,17 +148,13 @@ int afs_proc_init(void) goto error_dir; proc_afs->owner = THIS_MODULE; - p = create_proc_entry("cells", 0, proc_afs); + p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); if (!p) goto error_cells; - p->proc_fops = &afs_proc_cells_fops; - p->owner = THIS_MODULE; - p = create_proc_entry("rootcell", 0, proc_afs); + p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); if (!p) goto error_rootcell; - p->proc_fops = &afs_proc_rootcell_fops; - p->owner = THIS_MODULE; _leave(" = 0"); return 0; @@ -395,26 +396,20 @@ int afs_proc_cell_setup(struct afs_cell *cell) if (!cell->proc_dir) goto error_dir; - p = create_proc_entry("servers", 0, cell->proc_dir); + p = proc_create_data("servers", 0, cell->proc_dir, + &afs_proc_cell_servers_fops, cell); if (!p) goto error_servers; - p->proc_fops = &afs_proc_cell_servers_fops; - p->owner = THIS_MODULE; - p->data = cell; - p = create_proc_entry("vlservers", 0, cell->proc_dir); + p = proc_create_data("vlservers", 0, cell->proc_dir, + &afs_proc_cell_vlservers_fops, cell); if (!p) goto error_vlservers; - p->proc_fops = &afs_proc_cell_vlservers_fops; - p->owner = THIS_MODULE; - p->data = cell; - p = create_proc_entry("volumes", 0, cell->proc_dir); + p = proc_create_data("volumes", 0, cell->proc_dir, + &afs_proc_cell_volumes_fops, cell); if (!p) goto error_volumes; - p->proc_fops = &afs_proc_cell_volumes_fops; - p->owner = THIS_MODULE; - p->data = cell; _leave(" = 0"); return 0; @@ -191,6 +191,43 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ } while(0) + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. 
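The fs/afs/proc.c hunks above replace the two-step create_proc_entry() idiom, in which proc_fops, owner and data were patched into an entry that was already visible in /proc, with proc_create() and proc_create_data(), which publish the entry fully initialised. A minimal sketch of the new pattern; example_fops and the entry name are hypothetical, not code from the patch:

#include <linux/module.h>
#include <linux/proc_fs.h>

static const struct file_operations example_fops = {
	.owner = THIS_MODULE,
};

static int __init example_proc_init(void)
{
	struct proc_dir_entry *p;

	/*
	 * Old, racy idiom removed by this patch:
	 *
	 *	p = create_proc_entry("example", 0, NULL);
	 *	p->proc_fops = &example_fops;	// entry already live here
	 *	p->owner = THIS_MODULE;
	 *	p->data = private_data;
	 *
	 * New idiom: the entry only appears once it is complete.
	 */
	p = proc_create_data("example", 0, NULL, &example_fops, NULL);
	if (!p)
		return -ENOMEM;
	return 0;
}
module_init(example_proc_init);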
+ */ +static void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_events = ctx->max_reqs; + + BUG_ON(ctx->reqs_active); + + cancel_delayed_work(&ctx->wq); + cancel_work_sync(&ctx->wq.work); + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} + +#define get_ioctx(kioctx) do { \ + BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ + atomic_inc(&(kioctx)->users); \ +} while (0) +#define put_ioctx(kioctx) do { \ + BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ + if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ + __put_ioctx(kioctx); \ +} while (0) + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -240,7 +277,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (ctx->max_reqs == 0) goto out_cleanup; - /* now link into global list. kludge. FIXME */ + /* now link into global list. */ write_lock(&mm->ioctx_list_lock); ctx->next = mm->ioctx_list; mm->ioctx_list = ctx; @@ -361,32 +398,6 @@ void exit_aio(struct mm_struct *mm) } } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. - */ -void __put_ioctx(struct kioctx *ctx) -{ - unsigned nr_events = ctx->max_reqs; - - BUG_ON(ctx->reqs_active); - - cancel_delayed_work(&ctx->wq); - cancel_work_sync(&ctx->wq.work); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - pr_debug("__put_ioctx: freeing %p\n", ctx); - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } -} - /* aio_get_req * Allocate a slot for an aio request. Increments the users count * of the kioctx so that the kioctx stays around until all requests are @@ -542,10 +553,7 @@ int aio_put_req(struct kiocb *req) return ret; } -/* Lookup an ioctx id. ioctx_list is lockless for reads. - * FIXME: this is O(n) and is only suitable for development. - */ -struct kioctx *lookup_ioctx(unsigned long ctx_id) +static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct kioctx *ioctx; struct mm_struct *mm; @@ -1070,9 +1078,7 @@ static void timeout_func(unsigned long data) static inline void init_timeout(struct aio_timeout *to) { - init_timer(&to->timer); - to->timer.data = (unsigned long)to; - to->timer.function = timeout_func; + setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); to->timed_out = 0; to->p = current; } @@ -1205,6 +1211,7 @@ retry: if (timeout) clear_timeout(&to); out: + destroy_timer_on_stack(&to.timer); return i ? i : ret; } @@ -1552,7 +1559,7 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode, return 1; } -int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb) { struct kiocb *req; @@ -1593,7 +1600,7 @@ int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, * event using the eventfd_signal() function. 
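The fs/aio.c hunks above also convert the read_events() timeout from an open-coded init_timer() plus manual .data/.function assignment to setup_timer_on_stack(), paired with destroy_timer_on_stack() on the exit path so the timer debug-object code can track an object whose lifetime ends with the stack frame. A minimal sketch of the pairing, with hypothetical names:

#include <linux/timer.h>
#include <linux/jiffies.h>

static void example_timeout(unsigned long data)
{
	/* timer context; typically records the timeout and wakes the sleeper */
}

static void example_sleep_with_timeout(void)
{
	struct timer_list timer;	/* lives on the stack */

	/* replaces init_timer() + explicit .function/.data assignment */
	setup_timer_on_stack(&timer, example_timeout, 0UL);
	mod_timer(&timer, jiffies + HZ);

	/* ... wait for the event or the timeout ... */

	del_timer_sync(&timer);
	destroy_timer_on_stack(&timer);	/* mandatory companion call */
}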
*/ req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); - if (unlikely(IS_ERR(req->ki_eventfd))) { + if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); goto out_put_req; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 2d4ae40718d..c3d352d7fa9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -35,7 +35,7 @@ /* #define DEBUG */ #ifdef DEBUG -#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __FUNCTION__ , ##args); } while(0) +#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) #else #define DPRINTK(fmt,args...) do {} while(0) #endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a54a946a50a..aa4c5ff8a40 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -533,9 +533,9 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct goto next; if (d_unhashed(dentry)) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); struct inode *inode = dentry->d_inode; + ino = autofs4_dentry_ino(dentry); list_del_init(&ino->rehash); dget(dentry); /* diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 82123ff3e1d..e8717de3bab 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -489,9 +489,9 @@ static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { - char *p = nd_get_link(nd); - if (!IS_ERR(p)) - kfree(p); + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); } } diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 71faf4d2390..70f5d3a8eed 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -42,7 +42,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) #define printf(format, args...) \ - printk(KERN_ERR "BFS-fs: %s(): " format, __FUNCTION__, ## args) + printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a1bb2244cac..ba4cddb92f1 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -372,21 +372,17 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { - static unsigned long error_time, error_time2; if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) + (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) { printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; } - if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) + if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) { printk(KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_path.dentry->d_name.name); - error_time = jiffies; } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { @@ -495,15 +491,13 @@ static int load_aout_library(struct file *file) start_addr = ex.a_entry & 0xfffff000; if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - static unsigned long error_time; loff_t pos = N_TXTOFF(ex); - if ((jiffies-error_time) > 5*HZ) + if (printk_ratelimit()) { printk(KERN_WARNING "N_TXTOFF is not page aligned. 
Please convert library: %s\n", file->f_path.dentry->d_name.name); - error_time = jiffies; } down_write(&current->mm->mmap_sem); do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9924581df6f..b25707fee2c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1255,26 +1255,23 @@ static int writenote(struct memelfnote *men, struct file *file, static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) { + memset(elf, 0, sizeof(*elf)); + memcpy(elf->e_ident, ELFMAG, SELFMAG); elf->e_ident[EI_CLASS] = ELF_CLASS; elf->e_ident[EI_DATA] = ELF_DATA; elf->e_ident[EI_VERSION] = EV_CURRENT; elf->e_ident[EI_OSABI] = ELF_OSABI; - memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); elf->e_type = ET_CORE; elf->e_machine = machine; elf->e_version = EV_CURRENT; - elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); - elf->e_shoff = 0; elf->e_flags = flags; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = segs; - elf->e_shentsize = 0; - elf->e_shnum = 0; - elf->e_shstrndx = 0; + return; } @@ -1725,26 +1722,25 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread_status_size = 0; if (signr) { - struct elf_thread_status *tmp; + struct elf_thread_status *ets; rcu_read_lock(); do_each_thread(g, p) if (current->mm == p->mm && current != p) { - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { + ets = kzalloc(sizeof(*ets), GFP_ATOMIC); + if (!ets) { rcu_read_unlock(); return 0; } - tmp->thread = p; - list_add(&tmp->list, &info->thread_list); + ets->thread = p; + list_add(&ets->list, &info->thread_list); } while_each_thread(g, p); rcu_read_unlock(); list_for_each(t, &info->thread_list) { - struct elf_thread_status *tmp; int sz; - tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, ets); info->thread_status_size += sz; } } @@ -2000,10 +1996,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; - struct vm_area_struct *vma; + struct vm_area_struct *tmp_vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { + &page, &tmp_vma) <= 0) { DUMP_SEEK(PAGE_SIZE); } else { if (page == ZERO_PAGE(0)) { @@ -2013,7 +2009,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un } } else { void *kaddr; - flush_cache_page(vma, addr, + flush_cache_page(tmp_vma, addr, page_to_pfn(page)); kaddr = kmap(page); if ((size += PAGE_SIZE) > limit || diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 32649f2a165..ddd35d87339 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -136,8 +136,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); - if (retval < 0) - return retval; + if (unlikely(retval != size)) + return retval < 0 ? 
retval : -ENOEXEC; /* determine stack size for this binary */ phdr = params->phdrs; @@ -218,8 +218,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, phdr->p_offset, interpreter_name, phdr->p_filesz); - if (retval < 0) + if (unlikely(retval != phdr->p_filesz)) { + if (retval >= 0) + retval = -ENOEXEC; goto error; + } retval = -ENOENT; if (interpreter_name[phdr->p_filesz - 1] != '\0') @@ -245,8 +248,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); - if (retval < 0) + if (unlikely(retval != BINPRM_BUF_SIZE)) { + if (retval >= 0) + retval = -ENOEXEC; goto error; + } interp_params.hdr = *((struct elfhdr *) bprm->buf); break; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f95ae9789c9..f9c88d0c8ce 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->sh_bang++; /* Well, the bang-shell is implicit... */ + bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 0498b181dd5..3b40d45a3a1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -531,7 +531,8 @@ static int load_flat_file(struct linux_binprm * bprm, DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); down_write(&current->mm->mmap_sem); - textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE, 0); + textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + MAP_PRIVATE|MAP_EXECUTABLE, 0); up_write(&current->mm->mmap_sem); if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) @@ -932,14 +933,8 @@ static int __init init_flat_binfmt(void) return register_binfmt(&flat_format); } -static void __exit exit_flat_binfmt(void) -{ - unregister_binfmt(&flat_format); -} - /****************************************************************************/ core_initcall(init_flat_binfmt); -module_exit(exit_flat_binfmt); /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index dbf0ac0523d..7191306367c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -115,6 +115,12 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!enabled) goto _ret; + retval = -ENOEXEC; + if (bprm->misc_bang) + goto _ret; + + bprm->misc_bang = 1; + /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index ab33939b12a..9e3963f7ebf 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -29,7 +29,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) * Sorta complicated, but hopefully it will work. 
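The binfmt_elf_fdpic hunks above tighten the kernel_read() error handling: a non-negative but short read of the program headers, the interpreter name or the interpreter's first block is no longer treated as success but mapped to -ENOEXEC. The idiom, sketched with placeholder names:

static int example_read_exact(struct file *file, unsigned long offset,
			      void *buf, int size)
{
	int retval;

	retval = kernel_read(file, offset, (char *) buf, size);
	if (unlikely(retval != size))
		return retval < 0 ? retval : -ENOEXEC;
	return 0;
}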
-TYT */ - bprm->sh_bang++; + bprm->sh_bang = 1; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; @@ -937,6 +937,95 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, return ERR_PTR(-EINVAL); } +static void bio_copy_kern_endio(struct bio *bio, int err) +{ + struct bio_vec *bvec; + const int read = bio_data_dir(bio) == READ; + char *p = bio->bi_private; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *addr = page_address(bvec->bv_page); + + if (read && !err) + memcpy(p, addr, bvec->bv_len); + + __free_page(bvec->bv_page); + p += bvec->bv_len; + } + + bio_put(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + struct bio *bio; + struct bio_vec *bvec; + int i, ret; + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + goto cleanup; + } + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { + ret = -EINVAL; + goto cleanup; + } + + len -= bytes; + } + + if (!reading) { + void *p = data; + + bio_for_each_segment(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + memcpy(addr, p, bvec->bv_len); + p += bvec->bv_len; + } + } + + bio->bi_private = data; + bio->bi_end_io = bio_copy_kern_endio; + return bio; +cleanup: + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); + + bio_put(bio); + + return ERR_PTR(ret); +} + /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. 
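The new bio_copy_kern() complements bio_map_kern(): rather than mapping the caller's buffer into the bio, it bounces the data through freshly allocated pages, and bio_copy_kern_endio() copies the pages back into the buffer when a read completes before freeing them. A hedged usage sketch, assuming a valid request_queue q; the helper name is hypothetical:

#include <linux/bio.h>
#include <linux/blkdev.h>

static struct bio *example_build_kern_bio(struct request_queue *q,
					  void *buf, unsigned int len,
					  int reading)
{
	struct bio *bio;

	bio = bio_copy_kern(q, buf, len, GFP_KERNEL, reading);
	if (IS_ERR(bio))
		return bio;	/* ERR_PTR(-ENOMEM) or ERR_PTR(-EINVAL) */

	/* For a read, buf is filled in by bio_copy_kern_endio() at
	 * completion; for a write, the data was copied in above. */
	return bio;
}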
@@ -1273,6 +1362,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs); EXPORT_SYMBOL(bio_map_user); EXPORT_SYMBOL(bio_unmap_user); EXPORT_SYMBOL(bio_map_kern); +EXPORT_SYMBOL(bio_copy_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); EXPORT_SYMBOL(bio_split_pool); diff --git a/fs/buffer.c b/fs/buffer.c index 3db4a26adc4..a073f3f4f01 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1101,7 +1101,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) printk(KERN_ERR "%s: requested out-of-range block %llu for " "device %s\n", - __FUNCTION__, (unsigned long long)block, + __func__, (unsigned long long)block, bdevname(bdev, b)); return -EIO; } @@ -2211,8 +2211,8 @@ out: return err; } -int cont_expand_zero(struct file *file, struct address_space *mapping, - loff_t pos, loff_t *bytes) +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; unsigned blocksize = 1 << inode->i_blkbits; @@ -2328,23 +2328,6 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) return 0; } -int generic_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - __block_commit_write(inode,page,from,to); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} - /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must @@ -3315,7 +3298,6 @@ EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); -EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/char_dev.c b/fs/char_dev.c index 038674aa88a..68e510b8845 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -55,7 +55,6 @@ static struct char_device_struct { unsigned int baseminor; int minorct; char name[64]; - struct file_operations *fops; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 0228ed06069..cc950f69e51 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -468,7 +468,7 @@ cifs_proc_init(void) { struct proc_dir_entry *pde; - proc_fs_cifs = proc_mkdir("cifs", proc_root_fs); + proc_fs_cifs = proc_mkdir("fs/cifs", NULL); if (proc_fs_cifs == NULL) return; @@ -559,7 +559,7 @@ cifs_proc_clean(void) remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("Experimental", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); - remove_proc_entry("cifs", proc_root_fs); + remove_proc_entry("fs/cifs", NULL); } static int diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 95a54253c04..e1c854890f9 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -134,7 +134,7 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) unsigned int valid; /* clean out */ - vattr->va_mode = (umode_t) -1; + vattr->va_mode = -1; vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index f89ff083079..3d2580e00a3 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -345,7 +345,7 @@ static 
int coda_symlink(struct inode *dir_inode, struct dentry *de, } /* destruction routines: unlink, rmdir */ -int coda_unlink(struct inode *dir, struct dentry *de) +static int coda_unlink(struct inode *dir, struct dentry *de) { int error; const char *name = de->d_name.name; @@ -365,7 +365,7 @@ static int coda_unlink(struct inode *dir, struct dentry *de) return 0; } -int coda_rmdir(struct inode *dir, struct dentry *de) +static int coda_rmdir(struct inode *dir, struct dentry *de) { const char *name = de->d_name.name; int len = de->d_name.len; @@ -424,7 +424,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, /* file operations for directories */ -int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) { struct coda_file_info *cfi; struct file *host_file; diff --git a/fs/compat.c b/fs/compat.c index 2ce4456aad3..139dc93c092 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1634,7 +1634,7 @@ sticky: return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, @@ -1720,7 +1720,7 @@ sticky: if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1791,7 +1791,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) @@ -1825,7 +1825,7 @@ sticky: return ret; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ #if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) /* Stuff for NFS server syscalls... 
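The fs/compat.c hunks above switch the temporary-sigmask syscalls from testing TIF_RESTORE_SIGMASK directly to the HAVE_SET_RESTORE_SIGMASK conditional and the set_restore_sigmask() helper, letting each architecture decide where that flag lives. The shape of the idiom, sketched with the actual wait elided:

#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/string.h>

static long example_wait_with_sigmask(sigset_t *sigmask)
{
	sigset_t sigsaved;
	long ret;

	sigprocmask(SIG_SETMASK, sigmask, &sigsaved);

	ret = -ERESTARTNOHAND;	/* stand-in for an interrupted wait */

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the old mask until the pending signal has
		 * been delivered; this replaces the open-coded
		 * set_thread_flag(TIF_RESTORE_SIGMASK).
		 */
		memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
		set_restore_sigmask();
	} else {
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
	}
	return ret;
}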
*/ @@ -2080,7 +2080,7 @@ long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) #ifdef CONFIG_EPOLL -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long compat_sys_epoll_pwait(int epfd, struct compat_epoll_event __user *events, int maxevents, int timeout, @@ -2117,14 +2117,14 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, if (err == -EINTR) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } else sigprocmask(SIG_SETMASK, &sigsaved, NULL); } return err; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ #endif /* CONFIG_EPOLL */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c6e72aebd16..97dba0d9234 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1046,14 +1046,14 @@ static int vt_check(struct file *file) struct inode *inode = file->f_path.dentry->d_inode; struct vc_data *vc; - if (file->f_op->ioctl != tty_ioctl) + if (file->f_op->unlocked_ioctl != tty_ioctl) return -EINVAL; tty = (struct tty_struct *)file->private_data; if (tty_paranoia_check(tty, inode, "tty_ioctl")) return -EINVAL; - if (tty->driver->ioctl != vt_ioctl) + if (tty->ops->ioctl != vt_ioctl) return -EINVAL; vc = (struct vc_data *)tty->driver_data; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 397cb503a18..2b6cb23dd14 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -115,7 +115,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __FUNCTION__, count, *ppos, buffer->page); + __func__, count, *ppos, buffer->page); retval = simple_read_from_buffer(buf, count, ppos, buffer->page, buffer->count); out: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4c1ebff778e..b9a1d810346 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -47,7 +47,7 @@ static const struct address_space_operations configfs_aops = { static struct backing_dev_info configfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations configfs_inode_operations ={ diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index de3b31d0a37..8421cea7d8c 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -92,7 +92,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) root = d_alloc_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__FUNCTION__); + pr_debug("%s: could not get root dentry!\n",__func__); iput(inode); return -ENOMEM; } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 78929ea84ff..2a731ef5f30 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -210,13 +210,13 @@ static int configfs_get_target_path(struct config_item * item, struct config_ite if (size > PATH_MAX) return -ENAMETOOLONG; - pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); + pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size); for (s = path; depth--; s += 3) strcpy(s,"../"); fill_item_path(target, path, size); - pr_debug("%s: path = '%s'\n", __FUNCTION__, path); + pr_debug("%s: path = '%s'\n", __func__, path); return 0; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index f120e120787..285b64a8b06 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -17,6 +17,8 @@ #include <linux/namei.h> #include <linux/mount.h> 
#include <linux/tty.h> +#include <linux/mutex.h> +#include <linux/idr.h> #include <linux/devpts_fs.h> #include <linux/parser.h> #include <linux/fsnotify.h> @@ -26,6 +28,10 @@ #define DEVPTS_DEFAULT_MODE 0600 +extern int pty_limit; /* Config limit on Unix98 ptys */ +static DEFINE_IDR(allocated_ptys); +static DEFINE_MUTEX(allocated_ptys_lock); + static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; @@ -171,9 +177,44 @@ static struct dentry *get_node(int num) return lookup_one_len(s, root, sprintf(s, "%d", num)); } +int devpts_new_index(void) +{ + int index; + int idr_ret; + +retry: + if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { + return -ENOMEM; + } + + mutex_lock(&allocated_ptys_lock); + idr_ret = idr_get_new(&allocated_ptys, NULL, &index); + if (idr_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (idr_ret == -EAGAIN) + goto retry; + return -EIO; + } + + if (index >= pty_limit) { + idr_remove(&allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EIO; + } + mutex_unlock(&allocated_ptys_lock); + return index; +} + +void devpts_kill_index(int idx) +{ + mutex_lock(&allocated_ptys_lock); + idr_remove(&allocated_ptys, idx); + mutex_unlock(&allocated_ptys_lock); +} + int devpts_pty_new(struct tty_struct *tty) { - int number = tty->index; + int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ struct tty_driver *driver = tty->driver; dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index b64e55e0515..499e16759e9 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -200,7 +200,7 @@ int __init dlm_lockspace_init(void) dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); if (!dlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; } return 0; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 59375efcf39..3e5637fc377 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,18 +14,26 @@ int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb) { - struct inode *inode; + struct inode *inode, *toput_inode = NULL; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_WILL_FREE)) continue; + if (inode->i_mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + iput(toput_inode); + toput_inode = inode; + spin_lock(&inode_lock); } spin_unlock(&inode_lock); + iput(toput_inode); } -void drop_pagecache(void) +static void drop_pagecache(void) { struct super_block *sb; @@ -45,7 +53,7 @@ restart: spin_unlock(&sb_lock); } -void drop_slab(void) +static void drop_slab(void) { int nr_objects; diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 76885701551..1e34a7fd488 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a066e109ad9..cd62d75b2cc 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -119,21 +119,21 @@ static int ecryptfs_calculate_md5(char *dst, if (rc) { 
printk(KERN_ERR "%s: Error initializing crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } rc = crypto_hash_update(&desc, &sg, len); if (rc) { printk(KERN_ERR "%s: Error updating crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } rc = crypto_hash_final(&desc, dst); if (rc) { printk(KERN_ERR "%s: Error finalizing crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } out: @@ -437,7 +437,7 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, if (rc < 0) { printk(KERN_ERR "%s: Error attempting to encrypt page with " "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __FUNCTION__, page->index, extent_offset, + "rc = [%d]\n", __func__, page->index, extent_offset, rc); goto out; } @@ -487,7 +487,7 @@ int ecryptfs_encrypt_page(struct page *page) 0, PAGE_CACHE_SIZE); if (rc) printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __FUNCTION__, + "page at index [%ld]\n", __func__, page->index); goto out; } @@ -508,7 +508,7 @@ int ecryptfs_encrypt_page(struct page *page) extent_offset); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out; } ecryptfs_lower_offset_for_extent( @@ -569,7 +569,7 @@ static int ecryptfs_decrypt_extent(struct page *page, if (rc < 0) { printk(KERN_ERR "%s: Error attempting to decrypt to page with " "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __FUNCTION__, page->index, extent_offset, + "rc = [%d]\n", __func__, page->index, extent_offset, rc); goto out; } @@ -622,7 +622,7 @@ int ecryptfs_decrypt_page(struct page *page) ecryptfs_inode); if (rc) printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __FUNCTION__, + "page at index [%ld]\n", __func__, page->index); goto out; } @@ -656,7 +656,7 @@ int ecryptfs_decrypt_page(struct page *page) extent_offset); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out; } } @@ -1215,7 +1215,7 @@ int ecryptfs_read_and_validate_header_region(char *data, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { @@ -1246,7 +1246,6 @@ ecryptfs_write_header_metadata(char *virt, (*written) = 6; } -struct kmem_cache *ecryptfs_header_cache_0; struct kmem_cache *ecryptfs_header_cache_1; struct kmem_cache *ecryptfs_header_cache_2; @@ -1320,7 +1319,7 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, 0, crypt_stat->num_header_bytes_at_front); if (rc) printk(KERN_ERR "%s: Error attempting to write header " - "information to lower file; rc = [%d]\n", __FUNCTION__, + "information to lower file; rc = [%d]\n", __func__, rc); return rc; } @@ -1365,14 +1364,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) } } else { printk(KERN_WARNING "%s: Encrypted flag not set\n", - __FUNCTION__); + __func__); rc = -EINVAL; goto out; } /* Released in this function */ virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL); if (!virt) { - printk(KERN_ERR "%s: Out of memory\n", __FUNCTION__); + printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } @@ -1380,7 +1379,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", - 
__FUNCTION__, rc); + __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) @@ -1391,7 +1390,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry, virt); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out_free; } out_free: @@ -1585,7 +1584,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) if (!page_virt) { rc = -ENOMEM; printk(KERN_ERR "%s: Unable to allocate page_virt\n", - __FUNCTION__); + __func__); goto out; } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5007f788da0..951ee33a022 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -4,7 +4,7 @@ * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University - * Copyright (C) 2004-2007 International Business Machines Corp. + * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Trevor S. Highland <trevor.highland@gmail.com> * Tyler Hicks <tyhicks@ou.edu> @@ -34,6 +34,7 @@ #include <linux/namei.h> #include <linux/scatterlist.h> #include <linux/hash.h> +#include <linux/nsproxy.h> /* Version verification for shared data structures w/ userspace */ #define ECRYPTFS_VERSION_MAJOR 0x00 @@ -49,11 +50,13 @@ #define ECRYPTFS_VERSIONING_POLICY 0x00000008 #define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 +#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_PUBKEY \ | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY) + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_DEVMISC) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -73,17 +76,14 @@ #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_NLMSG_HELO 100 -#define ECRYPTFS_NLMSG_QUIT 101 -#define ECRYPTFS_NLMSG_REQUEST 102 -#define ECRYPTFS_NLMSG_RESPONSE 103 #define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_TRANSPORT_NETLINK 0 #define ECRYPTFS_TRANSPORT_CONNECTOR 1 #define ECRYPTFS_TRANSPORT_RELAYFS 2 -#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_NETLINK +#define ECRYPTFS_TRANSPORT_MISCDEV 3 +#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV #define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define RFC2440_CIPHER_DES3_EDE 0x02 @@ -366,32 +366,63 @@ struct ecryptfs_auth_tok_list_item { }; struct ecryptfs_message { + /* Can never be greater than ecryptfs_message_buf_len */ + /* Used to find the parent msg_ctx */ + /* Inherits from msg_ctx->index */ u32 index; u32 data_len; u8 data[]; }; struct ecryptfs_msg_ctx { -#define ECRYPTFS_MSG_CTX_STATE_FREE 0x0001 -#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x0002 -#define ECRYPTFS_MSG_CTX_STATE_DONE 0x0003 - u32 state; - unsigned int index; - unsigned int counter; +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 +#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 + u8 state; +#define ECRYPTFS_MSG_HELO 100 +#define ECRYPTFS_MSG_QUIT 101 +#define 
ECRYPTFS_MSG_REQUEST 102 +#define ECRYPTFS_MSG_RESPONSE 103 + u8 type; + u32 index; + /* Counter converts to a sequence number. Each message sent + * out for which we expect a response has an associated + * sequence number. The response must have the same sequence + * number as the counter for the msg_stc for the message to be + * valid. */ + u32 counter; + size_t msg_size; struct ecryptfs_message *msg; struct task_struct *task; struct list_head node; + struct list_head daemon_out_list; struct mutex mux; }; extern unsigned int ecryptfs_transport; -struct ecryptfs_daemon_id { - pid_t pid; - uid_t uid; - struct hlist_node id_chain; +struct ecryptfs_daemon; + +struct ecryptfs_daemon { +#define ECRYPTFS_DAEMON_IN_READ 0x00000001 +#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 +#define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 +#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 + u32 flags; + u32 num_queued_msg_ctx; + struct pid *pid; + uid_t euid; + struct user_namespace *user_ns; + struct task_struct *task; + struct mutex mux; + struct list_head msg_ctx_out_queue; + wait_queue_head_t wait; + struct hlist_node euid_chain; }; +extern struct mutex ecryptfs_daemon_hash_mux; + static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { @@ -500,7 +531,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) } #define ecryptfs_printk(type, fmt, arg...) \ - __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg); + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; @@ -581,10 +612,13 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid); -int ecryptfs_process_quit(uid_t uid, pid_t pid); -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, - pid_t pid, u32 seq); +int ecryptfs_process_helo(unsigned int transport, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq); int ecryptfs_send_message(unsigned int transport, char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, @@ -593,14 +627,14 @@ int ecryptfs_init_messaging(unsigned int transport); void ecryptfs_release_messaging(unsigned int transport); int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid); + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid); int ecryptfs_init_netlink(void); void ecryptfs_release_netlink(void); int ecryptfs_send_connector(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid); + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid); int ecryptfs_init_connector(void); void ecryptfs_release_connector(void); void @@ -642,5 +676,21 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); +int 
ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns); +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size); +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length); +int ecryptfs_init_ecryptfs_miscdev(void); +void ecryptfs_destroy_ecryptfs_miscdev(void); +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon); +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2b8f5ed4ade..2258b8f654a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -195,7 +195,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + mutex_lock(&crypt_stat->cs_mutex); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); rc = 0; goto out; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2386115210..0a1397335a8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -111,7 +111,7 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - if (unlikely(IS_ERR(lower_dir_dentry))) { + if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); rc = PTR_ERR(lower_dir_dentry); @@ -121,7 +121,7 @@ ecryptfs_do_create(struct inode *directory_inode, ecryptfs_dentry, mode, nd); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out_lock; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, @@ -908,7 +908,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ia->ia_valid &= ~ATTR_MODE; + mutex_lock(&lower_dentry->d_inode->i_mutex); rc = notify_change(lower_dentry, ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode, NULL); return rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 682b1b2482c..e82b457180b 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -65,7 +65,7 @@ static int process_request_key_err(long err_code) } /** - * parse_packet_length + * ecryptfs_parse_packet_length * @data: Pointer to memory containing length at offset * @size: This function writes the decoded size to this memory * address; zero on error @@ -73,8 +73,8 @@ static int process_request_key_err(long err_code) * * Returns zero on success; non-zero on error */ -static int parse_packet_length(unsigned char *data, size_t *size, - size_t *length_size) +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) { int rc = 0; @@ -105,7 +105,7 @@ out: } /** - * write_packet_length + * ecryptfs_write_packet_length * @dest: The byte array target into which to write the length. Must * have at least 5 bytes allocated. * @size: The length to write. 
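ecryptfs_write_packet_length() and ecryptfs_parse_packet_length() are the former static write_packet_length()/parse_packet_length(), renamed and given external linkage within eCryptfs so the new miscdev transport can reuse the OpenPGP-style length encoding. A hypothetical round trip through the pair, sized per the kernel-doc above (the destination must have at least 5 bytes):

static int example_length_round_trip(size_t size)
{
	char buf[5];
	size_t written, parsed, parsed_len;
	int rc;

	rc = ecryptfs_write_packet_length(buf, size, &written);
	if (rc)
		return rc;
	rc = ecryptfs_parse_packet_length((unsigned char *) buf,
					  &parsed, &parsed_len);
	if (rc)
		return rc;
	/* the decoded size and the consumed byte count must match */
	return (parsed == size && parsed_len == written) ? 0 : -EINVAL;
}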
@@ -114,8 +114,8 @@ out: * * Returns zero on success; non-zero on error. */ -static int write_packet_length(char *dest, size_t size, - size_t *packet_size_length) +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length) { int rc = 0; @@ -162,8 +162,8 @@ write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, goto out; } message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; - rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " "header; cannot generate packet length\n"); @@ -172,8 +172,9 @@ write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, i += packet_size_len; memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); i += ECRYPTFS_SIG_SIZE_HEX; - rc = write_packet_length(&message[i], session_key->encrypted_key_size, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], + session_key->encrypted_key_size, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " "header; cannot generate packet length\n"); @@ -225,7 +226,7 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, rc = -EIO; goto out; } - rc = parse_packet_length(&data[i], &m_size, &data_len); + rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len); if (rc) { ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " "rc = [%d]\n", rc); @@ -304,8 +305,8 @@ write_tag_66_packet(char *signature, u8 cipher_code, goto out; } message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; - rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " "header; cannot generate packet length\n"); @@ -315,8 +316,8 @@ write_tag_66_packet(char *signature, u8 cipher_code, memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); i += ECRYPTFS_SIG_SIZE_HEX; /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ - rc = write_packet_length(&message[i], crypt_stat->key_size + 3, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " "header; cannot generate packet length\n"); @@ -357,20 +358,25 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, /* verify that everything through the encrypted FEK size is present */ if (message_len < 4) { rc = -EIO; + printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " + "message length is [%d]\n", __func__, message_len, 4); goto out; } if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { - ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_67\n"); rc = -EIO; + printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n", + __func__); goto out; } if (data[i++]) { - ecryptfs_printk(KERN_ERR, "Status indicator has non zero value" - " [%d]\n", data[i-1]); rc = -EIO; + printk(KERN_ERR "%s: Status indicator has non zero " + "value [%d]\n", __func__, data[i-1]); + goto out; } - rc = parse_packet_length(&data[i], &key_rec->enc_key_size, &data_len); + rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size, + &data_len); if (rc) { ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " "rc = [%d]\n", rc); @@ -378,17 +384,17 @@ 
parse_tag_67_packet(struct ecryptfs_key_record *key_rec, } i += data_len; if (message_len < (i + key_rec->enc_key_size)) { - ecryptfs_printk(KERN_ERR, "message_len [%d]; max len is [%d]\n", - message_len, (i + key_rec->enc_key_size)); rc = -EIO; + printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", + __func__, message_len, (i + key_rec->enc_key_size)); goto out; } if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { - ecryptfs_printk(KERN_ERR, "Encrypted key_size [%d] larger than " - "the maximum key size [%d]\n", - key_rec->enc_key_size, - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); rc = -EIO; + printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " + "the maximum key size [%d]\n", __func__, + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); goto out; } memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); @@ -445,7 +451,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), &netlink_message, &netlink_message_length); if (rc) { - ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet"); + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); goto out; } rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, @@ -570,8 +576,8 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, goto out; } (*new_auth_tok) = &auth_tok_list_item->auth_tok; - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Error parsing packet length; " "rc = [%d]\n", rc); @@ -704,8 +710,8 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, goto out; } (*new_auth_tok) = &auth_tok_list_item->auth_tok; - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n", rc); @@ -852,8 +858,8 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Invalid tag 11 packet format\n"); goto out; @@ -1405,8 +1411,8 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes, auth_tok->token.private_key.key_size; rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); if (rc) { - ecryptfs_printk(KERN_ERR, "Failed to encrypt session key " - "via a pki"); + printk(KERN_ERR "Failed to encrypt session key via a key " + "module; rc = [%d]\n", rc); goto out; } if (ecryptfs_verbosity > 0) { @@ -1430,8 +1436,9 @@ encrypted_session_key_set: goto out; } dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; - rc = write_packet_length(&dest[(*packet_size)], (max_packet_size - 4), - &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " "header; cannot generate packet length\n"); @@ -1489,8 +1496,9 @@ write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents, goto out; } dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; - rc = write_packet_length(&dest[(*packet_length)], - (max_packet_size - 4), &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_length)], + 
(max_packet_size - 4), + &packet_size_length); if (rc) { printk(KERN_ERR "Error generating tag 11 packet header; cannot " "generate packet length. rc = [%d]\n", rc); @@ -1682,8 +1690,9 @@ encrypted_session_key_set: dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; /* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3) * to get the number of octets in the actual Tag 3 packet */ - rc = write_packet_length(&dest[(*packet_size)], (max_packet_size - 4), - &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); if (rc) { printk(KERN_ERR "Error generating tag 3 packet header; cannot " "generate packet length. rc = [%d]\n", rc); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d25ac9500a9..d603631601e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -219,7 +219,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, if (rc) { printk(KERN_ERR "%s: Error attempting to initialize the " "persistent file for the dentry with name [%s]; " - "rc = [%d]\n", __FUNCTION__, dentry->d_name.name, rc); + "rc = [%d]\n", __func__, dentry->d_name.name, rc); goto out; } out: diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 9cc2aec27b0..1b5c20058ac 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -1,7 +1,7 @@ /** * eCryptfs: Linux filesystem encryption layer * - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> * Tyler Hicks <tyhicks@ou.edu> * @@ -20,19 +20,21 @@ * 02111-1307, USA. */ #include <linux/sched.h> +#include <linux/user_namespace.h> +#include <linux/nsproxy.h> #include "ecryptfs_kernel.h" static LIST_HEAD(ecryptfs_msg_ctx_free_list); static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); static struct mutex ecryptfs_msg_ctx_lists_mux; -static struct hlist_head *ecryptfs_daemon_id_hash; -static struct mutex ecryptfs_daemon_id_hash_mux; +static struct hlist_head *ecryptfs_daemon_hash; +struct mutex ecryptfs_daemon_hash_mux; static int ecryptfs_hash_buckets; #define ecryptfs_uid_hash(uid) \ hash_long((unsigned long)uid, ecryptfs_hash_buckets) -static unsigned int ecryptfs_msg_counter; +static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; /** @@ -40,9 +42,10 @@ static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; * @msg_ctx: The context that was acquired from the free list * * Acquires a context element from the free list and locks the mutex - * on the context. Returns zero on success; non-zero on error or upon - * failure to acquire a free context element. Be sure to lock the - * list mutex before calling. + * on the context. Sets the msg_ctx task to current. Returns zero on + * success; non-zero on error or upon failure to acquire a free + * context element. Must be called with ecryptfs_msg_ctx_lists_mux + * held. */ static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) { @@ -50,11 +53,11 @@ static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) int rc; if (list_empty(&ecryptfs_msg_ctx_free_list)) { - ecryptfs_printk(KERN_WARNING, "The eCryptfs free " - "context list is empty. It may be helpful to " - "specify the ecryptfs_message_buf_len " - "parameter to be greater than the current " - "value of [%d]\n", ecryptfs_message_buf_len); + printk(KERN_WARNING "%s: The eCryptfs free " + "context list is empty. 
It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", __func__, ecryptfs_message_buf_len); rc = -ENOMEM; goto out; } @@ -75,8 +78,7 @@ out: * ecryptfs_msg_ctx_free_to_alloc * @msg_ctx: The context to move from the free list to the alloc list * - * Be sure to lock the list mutex and the context mutex before - * calling. + * Must be called with ecryptfs_msg_ctx_lists_mux held. */ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) { @@ -89,36 +91,39 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) * ecryptfs_msg_ctx_alloc_to_free * @msg_ctx: The context to move from the alloc list to the free list * - * Be sure to lock the list mutex and the context mutex before - * calling. + * Must be called with ecryptfs_msg_ctx_lists_mux held. */ -static void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) { list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); if (msg_ctx->msg) kfree(msg_ctx->msg); + msg_ctx->msg = NULL; msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; } /** - * ecryptfs_find_daemon_id - * @uid: The user id which maps to the desired daemon id - * @id: If return value is zero, points to the desired daemon id - * pointer + * ecryptfs_find_daemon_by_euid + * @euid: The effective user id which maps to the desired daemon id + * @user_ns: The namespace in which @euid applies + * @daemon: If return value is zero, points to the desired daemon pointer * - * Search the hash list for the given user id. Returns zero if the - * user id exists in the list; non-zero otherwise. The daemon id hash - * mutex should be held before calling this function. + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Search the hash list for the given user id. + * + * Returns zero if the user id exists in the list; non-zero otherwise. */ -static int ecryptfs_find_daemon_id(uid_t uid, struct ecryptfs_daemon_id **id) +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns) { struct hlist_node *elem; int rc; - hlist_for_each_entry(*id, elem, - &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)], - id_chain) { - if ((*id)->uid == uid) { + hlist_for_each_entry(*daemon, elem, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], + euid_chain) { + if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { rc = 0; goto out; } @@ -128,181 +133,325 @@ out: return rc; } -static int ecryptfs_send_raw_message(unsigned int transport, u16 msg_type, - pid_t pid) +static int +ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, + u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); + +/** + * ecryptfs_send_raw_message + * @transport: Transport type + * @msg_type: Message type + * @daemon: Daemon struct for recipient of message + * + * A raw message is one that does not include an ecryptfs_message + * struct. It simply has a type. + * + * Must be called with ecryptfs_daemon_hash_mux held. 
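The ecryptfs_find_daemon_by_euid() lookup above is a plain chained-hash search keyed on the (euid, user_ns) pair. A self-contained userspace analogue of the same bucket-and-chain pattern, with illustrative names and a modulo hash standing in for the kernel's hash_long():

#include <stddef.h>
#include <sys/types.h>

#define NUM_BUCKETS 32	/* the kernel sizes this from ecryptfs_number_of_users */

struct daemon {
	uid_t euid;
	const void *user_ns;	/* stand-in for struct user_namespace * */
	struct daemon *next;	/* chain within one bucket */
};

static struct daemon *daemon_hash[NUM_BUCKETS];

static unsigned int uid_hash(uid_t euid)
{
	return (unsigned int)euid % NUM_BUCKETS;	/* hash_long() stand-in */
}

/* Mirrors the lookup loop above: both the euid and the user
 * namespace must match for a registered daemon to be returned. */
static struct daemon *find_daemon_by_euid(uid_t euid, const void *user_ns)
{
	struct daemon *d;

	for (d = daemon_hash[uid_hash(euid)]; d != NULL; d = d->next)
		if (d->euid == euid && d->user_ns == user_ns)
			return d;
	return NULL;	/* caller maps this to -EINVAL */
}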
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, + struct ecryptfs_daemon *daemon) { + struct ecryptfs_msg_ctx *msg_ctx; int rc; switch(transport) { case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, pid); + rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, + daemon->pid); + break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type, + &msg_ctx); + if (rc) { + printk(KERN_ERR "%s: Error whilst attempting to send " + "message via miscdevfs; rc = [%d]\n", __func__, rc); + goto out; + } + /* Raw messages are logically context-free (e.g., no + * reply is expected), so we set the state of the + * ecryptfs_msg_ctx object to indicate that it should + * be freed as soon as the transport sends out the message. */ + mutex_lock(&msg_ctx->mux); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; + mutex_unlock(&msg_ctx->mux); break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: rc = -ENOSYS; } +out: + return rc; +} + +/** + * ecryptfs_spawn_daemon - Create and initialize a new daemon struct + * @daemon: Pointer to set to newly allocated daemon struct + * @euid: Effective user id for the daemon + * @user_ns: The namespace in which @euid applies + * @pid: Process id for the daemon + * + * Must be called ceremoniously while in possession of + * ecryptfs_sacred_daemon_hash_mux + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) +{ + int rc = 0; + + (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); + if (!(*daemon)) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); + goto out; + } + (*daemon)->euid = euid; + (*daemon)->user_ns = get_user_ns(user_ns); + (*daemon)->pid = get_pid(pid); + (*daemon)->task = current; + mutex_init(&(*daemon)->mux); + INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); + init_waitqueue_head(&(*daemon)->wait); + (*daemon)->num_queued_msg_ctx = 0; + hlist_add_head(&(*daemon)->euid_chain, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); +out: return rc; } /** * ecryptfs_process_helo * @transport: The underlying transport (netlink, etc.) - * @uid: The user ID owner of the message + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the * message * - * Adds the uid and pid values to the daemon id hash. If a uid + * Adds the euid and pid values to the daemon euid hash. If an euid * already has a daemon pid registered, the daemon will be - * unregistered before the new daemon id is put into the hash list. - * Returns zero after adding a new daemon id to the hash list; + * unregistered before the new daemon is put into the hash list. + * Returns zero after adding a new daemon to the hash list; * non-zero otherwise. 
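The daemon objects created by ecryptfs_spawn_daemon() above carry the ECRYPTFS_DAEMON_* flag bits declared in the header hunk earlier. A small illustrative predicate (a sketch only, mirroring the -EBUSY test that ecryptfs_exorcise_daemon() applies below):

#include <stdint.h>

/* Flag values copied from the ecryptfs_kernel.h hunk above. */
#define ECRYPTFS_DAEMON_IN_READ      0x00000001
#define ECRYPTFS_DAEMON_IN_POLL      0x00000002
#define ECRYPTFS_DAEMON_ZOMBIE       0x00000004
#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008

int daemon_may_be_exorcised(uint32_t flags)
{
	/* A daemon in the middle of a read() or poll() must not be
	 * torn down. */
	return !(flags & (ECRYPTFS_DAEMON_IN_READ |
			  ECRYPTFS_DAEMON_IN_POLL));
}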
 */ -int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid) +int ecryptfs_process_helo(unsigned int transport, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) { - struct ecryptfs_daemon_id *new_id; - struct ecryptfs_daemon_id *old_id; + struct ecryptfs_daemon *new_daemon; + struct ecryptfs_daemon *old_daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - new_id = kmalloc(sizeof(*new_id), GFP_KERNEL); - if (!new_id) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable " - "to register daemon [%d] for user [%d]\n", - pid, uid); - goto unlock; - } - if (!ecryptfs_find_daemon_id(uid, &old_id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns); + if (rc == 0) { printk(KERN_WARNING "Received request from user [%d] " - "to register daemon [%d]; unregistering daemon " - "[%d]\n", uid, pid, old_id->pid); - hlist_del(&old_id->id_chain); - rc = ecryptfs_send_raw_message(transport, ECRYPTFS_NLMSG_QUIT, - old_id->pid); + "to register daemon [0x%p]; unregistering daemon " + "[0x%p]\n", euid, pid, old_daemon->pid); + rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, + old_daemon); if (rc) printk(KERN_WARNING "Failed to send QUIT " - "message to daemon [%d]; rc = [%d]\n", - old_id->pid, rc); - kfree(old_id); + "message to daemon [0x%p]; rc = [%d]\n", + old_daemon->pid, rc); + hlist_del(&old_daemon->euid_chain); + kfree(old_daemon); } - new_id->uid = uid; - new_id->pid = pid; - hlist_add_head(&new_id->id_chain, - &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)]); - rc = 0; -unlock: - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid); + if (rc) + printk(KERN_ERR "%s: The gods are displeased with this attempt " + "to create a new daemon object for euid [%d]; pid " + "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_exorcise_daemon - Destroy the daemon struct + * + * Must be called ceremoniously while in possession of + * ecryptfs_daemon_hash_mux and the daemon's own mux. 
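That comment pins down the locking discipline the messaging code relies on: the global ecryptfs_daemon_hash_mux is always taken before any per-daemon mux, never the other way around. A compilable userspace sketch of the same ordering, with pthread mutexes standing in and all names illustrative:

#include <pthread.h>

/* Hypothetical stand-ins for the two locks; the point is the order:
 * the registry-wide lock strictly before any per-daemon lock. */
static pthread_mutex_t daemon_hash_mux = PTHREAD_MUTEX_INITIALIZER;

struct daemon_locks {
	pthread_mutex_t mux;
};

void exorcise_pattern(struct daemon_locks *d)
{
	pthread_mutex_lock(&daemon_hash_mux);	/* 1: registry-wide lock */
	pthread_mutex_lock(&d->mux);		/* 2: this daemon's lock */
	/* ... drain the out queue, unhash, free ... */
	pthread_mutex_unlock(&d->mux);
	pthread_mutex_unlock(&daemon_hash_mux);
}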
+ */ +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp; + int rc = 0; + + mutex_lock(&daemon->mux); + if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) + || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { + rc = -EBUSY; + printk(KERN_WARNING "%s: Attempt to destroy daemon with pid " + "[0x%p], but it is in the midst of a read or a poll\n", + __func__, daemon->pid); + mutex_unlock(&daemon->mux); + goto out; + } + list_for_each_entry_safe(msg_ctx, msg_ctx_tmp, + &daemon->msg_ctx_out_queue, daemon_out_list) { + list_del(&msg_ctx->daemon_out_list); + daemon->num_queued_msg_ctx--; + printk(KERN_WARNING "%s: Warning: dropping message that is in " + "the out queue of a dying daemon\n", __func__); + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + } + hlist_del(&daemon->euid_chain); + if (daemon->task) + wake_up_process(daemon->task); + if (daemon->pid) + put_pid(daemon->pid); + if (daemon->user_ns) + put_user_ns(daemon->user_ns); + mutex_unlock(&daemon->mux); + memset(daemon, 0, sizeof(*daemon)); + kfree(daemon); +out: return rc; } /** * ecryptfs_process_quit - * @uid: The user ID owner of the message + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the * message * - * Deletes the corresponding daemon id for the given uid and pid, if + * Deletes the corresponding daemon for the given euid and pid, if * it is the registered daemon that is requesting the deletion. Returns zero - * after deleting the desired daemon id; non-zero otherwise. + * after deleting the desired daemon; non-zero otherwise. */ -int ecryptfs_process_quit(uid_t uid, pid_t pid) +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - if (ecryptfs_find_daemon_id(uid, &id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns); + if (rc || !daemon) { rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Received request from user [%d] to " - "unregister unrecognized daemon [%d]\n", uid, - pid); - goto unlock; + printk(KERN_ERR "Received request from user [%d] to " + "unregister unrecognized daemon [0x%p]\n", euid, pid); + goto out_unlock; } - if (id->pid != pid) { - rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Received request from user [%d] " - "with pid [%d] to unregister daemon [%d]\n", - uid, pid, id->pid); - goto unlock; - } - hlist_del(&id->id_chain); - kfree(id); - rc = 0; -unlock: - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_exorcise_daemon(daemon); +out_unlock: + mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } /** * ecryptfs_process_response * @msg: The ecryptfs message received; the caller should sanity check - * msg->data_len + * msg->data_len and free the memory * @pid: The process ID of the userspace application that sent the * message - * @seq: The sequence number of the message + * @seq: The sequence number of the message; must match the sequence + * number for the existing message context waiting for this + * response + * + * Processes a response message after sending an operation request to + * userspace. Some other process is awaiting this response. Before + * sending out its first communications, the other process allocated a + * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. 
The + * response message contains this index so that we can copy over the + * response message into the msg_ctx that the process holds a + * reference to. The other process is going to wake up, check to see + * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then + * proceed to read off and process the response message. Returns zero + * upon delivery to desired context element; non-zero upon delivery + * failure or error. * - * Processes a response message after sending a operation request to - * userspace. Returns zero upon delivery to desired context element; - * non-zero upon delivery failure or error. + * Returns zero on success; non-zero otherwise */ -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, - pid_t pid, u32 seq) +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; struct ecryptfs_msg_ctx *msg_ctx; - int msg_size; + size_t msg_size; + struct nsproxy *nsproxy; + struct user_namespace *current_user_ns; int rc; if (msg->index >= ecryptfs_message_buf_len) { rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Attempt to reference " - "context buffer at index [%d]; maximum " - "allowable is [%d]\n", msg->index, - (ecryptfs_message_buf_len - 1)); + printk(KERN_ERR "%s: Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", __func__, msg->index, + (ecryptfs_message_buf_len - 1)); goto out; } msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; mutex_lock(&msg_ctx->mux); - if (ecryptfs_find_daemon_id(msg_ctx->task->euid, &id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rcu_read_lock(); + nsproxy = task_nsproxy(msg_ctx->task); + if (nsproxy == NULL) { rc = -EBADMSG; - ecryptfs_printk(KERN_WARNING, "User [%d] received a " - "message response from process [%d] but does " - "not have a registered daemon\n", - msg_ctx->task->euid, pid); + printk(KERN_ERR "%s: Receiving process is a zombie. 
Dropping " + "message.\n", __func__); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - if (msg_ctx->task->euid != uid) { + current_user_ns = nsproxy->user_ns; + rc = ecryptfs_find_daemon_by_euid(&daemon, msg_ctx->task->euid, + current_user_ns); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (rc) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: User [%d] received a " + "message response from process [0x%p] but does " + "not have a registered daemon\n", __func__, + msg_ctx->task->euid, pid); + goto wake_up; + } + if (msg_ctx->task->euid != euid) { rc = -EBADMSG; - ecryptfs_printk(KERN_WARNING, "Received message from user " - "[%d]; expected message from user [%d]\n", - uid, msg_ctx->task->euid); + printk(KERN_WARNING "%s: Received message from user " + "[%d]; expected message from user [%d]\n", __func__, + euid, msg_ctx->task->euid); goto unlock; } - if (id->pid != pid) { + if (current_user_ns != user_ns) { rc = -EBADMSG; - ecryptfs_printk(KERN_ERR, "User [%d] received a " - "message response from an unrecognized " - "process [%d]\n", msg_ctx->task->euid, pid); + printk(KERN_WARNING "%s: Received message from user_ns " + "[0x%p]; expected message from user_ns [0x%p]\n", + __func__, user_ns, nsproxy->user_ns); + goto unlock; + } + if (daemon->pid != pid) { + rc = -EBADMSG; + printk(KERN_ERR "%s: User [%d] sent a message response " + "from an unrecognized process [0x%p]\n", + __func__, msg_ctx->task->euid, pid); goto unlock; } if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Desired context element is not " - "pending a response\n"); + printk(KERN_WARNING "%s: Desired context element is not " + "pending a response\n", __func__); goto unlock; } else if (msg_ctx->counter != seq) { rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Invalid message sequence; " - "expected [%d]; received [%d]\n", - msg_ctx->counter, seq); + printk(KERN_WARNING "%s: Invalid message sequence; " + "expected [%d]; received [%d]\n", __func__, + msg_ctx->counter, seq); goto unlock; } - msg_size = sizeof(*msg) + msg->data_len; + msg_size = (sizeof(*msg) + msg->data_len); msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } memcpy(msg_ctx->msg, msg, msg_size); @@ -317,34 +466,38 @@ out: } /** - * ecryptfs_send_message + * ecryptfs_send_message_locked * @transport: The transport over which to send the message (i.e., * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send + * + * Must be called with ecryptfs_daemon_hash_mux held. 
+ * + * Returns zero on success; non-zero otherwise */ -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, - struct ecryptfs_msg_ctx **msg_ctx) +static int +ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, + u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - if (ecryptfs_find_daemon_id(current->euid, &id)) { - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + if (rc || !daemon) { rc = -ENOTCONN; - ecryptfs_printk(KERN_ERR, "User [%d] does not have a daemon " - "registered\n", current->euid); + printk(KERN_ERR "%s: User [%d] does not have a daemon " + "registered\n", __func__, current->euid); goto out; } - mutex_unlock(&ecryptfs_daemon_id_hash_mux); mutex_lock(&ecryptfs_msg_ctx_lists_mux); rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); if (rc) { mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - ecryptfs_printk(KERN_WARNING, "Could not claim a free " - "context element\n"); + printk(KERN_WARNING "%s: Could not claim a free " + "context element\n", __func__); goto out; } ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); @@ -352,23 +505,50 @@ int ecryptfs_send_message(unsigned int transport, char *data, int data_len, mutex_unlock(&ecryptfs_msg_ctx_lists_mux); switch (transport) { case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, - ECRYPTFS_NLMSG_REQUEST, 0, id->pid); + rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type, + 0, daemon->pid); + break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, + 0, daemon); break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: rc = -ENOSYS; } - if (rc) { - printk(KERN_ERR "Error attempting to send message to userspace " - "daemon; rc = [%d]\n", rc); - } + if (rc) + printk(KERN_ERR "%s: Error attempting to send message to " + "userspace daemon; rc = [%d]\n", __func__, rc); out: return rc; } /** + * ecryptfs_send_message + * @transport: The transport over which to send the message (i.e., + * netlink) + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Grabs ecryptfs_daemon_hash_mux. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_message(unsigned int transport, char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_send_message_locked(transport, data, data_len, + ECRYPTFS_MSG_REQUEST, msg_ctx); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** * ecryptfs_wait_for_response * @msg_ctx: The context that was assigned when sending a message * @msg: The incoming message from userspace; not set if rc != 0 @@ -377,7 +557,7 @@ out: * of time exceeds ecryptfs_message_wait_timeout. If zero is * returned, msg will point to a valid message from userspace; a * non-zero value is returned upon failure to receive a message or an - * error occurs. + * error occurs. The caller must free @msg on success. 
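For callers, the locked/unlocked split above is invisible: the pattern remains ecryptfs_send_message() followed by ecryptfs_wait_for_response(), as the keystore code uses for its tag 64 and tag 66 packets. A sketch of that calling sequence (illustrative function; in-tree context and the shown prototypes assumed):

#include "ecryptfs_kernel.h"

/* Sketch of the round trip: send a request, block for the
 * sequence-checked reply, then free the returned message. */
static int example_round_trip(char *payload, int payload_len)
{
	struct ecryptfs_msg_ctx *msg_ctx;
	struct ecryptfs_message *msg = NULL;
	int rc;

	rc = ecryptfs_send_message(ecryptfs_transport, payload,
				   payload_len, &msg_ctx);
	if (rc)
		return rc;	/* no daemon registered, or send failed */
	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
	if (rc)
		return rc;	/* timed out; msg was not set */
	/* ... consume msg->data_len octets at msg->data ... */
	kfree(msg);		/* the caller owns msg on success */
	return 0;
}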
*/ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **msg) @@ -413,32 +593,32 @@ int ecryptfs_init_messaging(unsigned int transport) if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; - ecryptfs_printk(KERN_WARNING, "Specified number of users is " - "too large, defaulting to [%d] users\n", - ecryptfs_number_of_users); + printk(KERN_WARNING "%s: Specified number of users is " + "too large, defaulting to [%d] users\n", __func__, + ecryptfs_number_of_users); } - mutex_init(&ecryptfs_daemon_id_hash_mux); - mutex_lock(&ecryptfs_daemon_id_hash_mux); + mutex_init(&ecryptfs_daemon_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); ecryptfs_hash_buckets = 1; while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) ecryptfs_hash_buckets++; - ecryptfs_daemon_id_hash = kmalloc(sizeof(struct hlist_head) - * ecryptfs_hash_buckets, GFP_KERNEL); - if (!ecryptfs_daemon_id_hash) { + ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) + * ecryptfs_hash_buckets), GFP_KERNEL); + if (!ecryptfs_daemon_hash) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } for (i = 0; i < ecryptfs_hash_buckets; i++) - INIT_HLIST_HEAD(&ecryptfs_daemon_id_hash[i]); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); - + INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); + mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) - * ecryptfs_message_buf_len), GFP_KERNEL); + * ecryptfs_message_buf_len), + GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); goto out; } mutex_init(&ecryptfs_msg_ctx_lists_mux); @@ -446,6 +626,7 @@ int ecryptfs_init_messaging(unsigned int transport) ecryptfs_msg_counter = 0; for (i = 0; i < ecryptfs_message_buf_len; i++) { INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list); mutex_init(&ecryptfs_msg_ctx_arr[i].mux); mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); ecryptfs_msg_ctx_arr[i].index = i; @@ -464,6 +645,11 @@ int ecryptfs_init_messaging(unsigned int transport) if (rc) ecryptfs_release_messaging(transport); break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(transport); + break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: @@ -488,27 +674,37 @@ void ecryptfs_release_messaging(unsigned int transport) kfree(ecryptfs_msg_ctx_arr); mutex_unlock(&ecryptfs_msg_ctx_lists_mux); } - if (ecryptfs_daemon_id_hash) { + if (ecryptfs_daemon_hash) { struct hlist_node *elem; - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int i; - mutex_lock(&ecryptfs_daemon_id_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < ecryptfs_hash_buckets; i++) { - hlist_for_each_entry(id, elem, - &ecryptfs_daemon_id_hash[i], - id_chain) { - hlist_del(elem); - kfree(id); + int rc; + + hlist_for_each_entry(daemon, elem, + &ecryptfs_daemon_hash[i], + euid_chain) { + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) + printk(KERN_ERR "%s: Error whilst " + "attempting to destroy daemon; " + "rc = [%d]. 
Dazed and confused, " + "but trying to continue.\n", + __func__, rc); } } - kfree(ecryptfs_daemon_id_hash); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + kfree(ecryptfs_daemon_hash); + mutex_unlock(&ecryptfs_daemon_hash_mux); } switch(transport) { case ECRYPTFS_TRANSPORT_NETLINK: ecryptfs_release_netlink(); break; + case ECRYPTFS_TRANSPORT_MISCDEV: + ecryptfs_destroy_ecryptfs_miscdev(); + break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c new file mode 100644 index 00000000000..788995efd1d --- /dev/null +++ b/fs/ecryptfs/miscdev.c @@ -0,0 +1,598 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/hash.h> +#include <linux/random.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#include <linux/wait.h> +#include <linux/module.h> +#include "ecryptfs_kernel.h" + +static atomic_t ecryptfs_num_miscdev_opens; + +/** + * ecryptfs_miscdev_poll + * @file: dev file (ignored) + * @pt: dev poll table (ignored) + * + * Returns the poll mask + */ +static unsigned int +ecryptfs_miscdev_poll(struct file *file, poll_table *pt) +{ + struct ecryptfs_daemon *daemon; + unsigned int mask = 0; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
*/ + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + printk(KERN_WARNING "%s: Attempt to poll on zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) + goto out_unlock_daemon; + if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL) + goto out_unlock_daemon; + daemon->flags |= ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + poll_wait(file, &daemon->wait, pt); + mutex_lock(&daemon->mux); + if (!list_empty(&daemon->msg_ctx_out_queue)) + mask |= POLLIN | POLLRDNORM; +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + return mask; +} + +/** + * ecryptfs_miscdev_open + * @inode: inode of miscdev handle (ignored) + * @file: file for miscdev handle (ignored) + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_open(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = try_module_get(THIS_MODULE); + if (rc == 0) { + rc = -EIO; + printk(KERN_ERR "%s: Error attempting to increment module use " + "count; rc = [%d]\n", __func__, rc); + goto out_unlock_daemon_list; + } + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + if (rc || !daemon) { + rc = ecryptfs_spawn_daemon(&daemon, current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_module_put_unlock_daemon_list; + } + } + mutex_lock(&daemon->mux); + if (daemon->pid != task_pid(current)) { + rc = -EINVAL; + printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " + "but pid [0x%p] has attempted to open the handle " + "instead\n", __func__, daemon->pid, daemon->euid, + task_pid(current)); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { + rc = -EBUSY; + printk(KERN_ERR "%s: Miscellaneous device handle may only be " + "opened once per daemon; pid [0x%p] already has this " + "handle open\n", __func__, daemon->pid); + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_inc(&ecryptfs_num_miscdev_opens); +out_unlock_daemon: + mutex_unlock(&daemon->mux); +out_module_put_unlock_daemon_list: + if (rc) + module_put(THIS_MODULE); +out_unlock_daemon_list: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_miscdev_release + * @inode: inode of fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle (ignored) + * + * This keeps the daemon registered until the daemon sends another + * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. 
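From userspace, the open/poll/release rules above shape a daemon's main loop. A hypothetical skeleton (it assumes udev exposes the dynamic-minor device registered later in this file as /dev/ecryptfs; error handling is trimmed):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical daemon skeleton: one open per daemon is enforced by
 * ecryptfs_miscdev_open() above; opening registers the daemon. */
int main(void)
{
	unsigned char buf[4096];
	struct pollfd pfd;
	ssize_t n;
	int fd;

	fd = open("/dev/ecryptfs", O_RDWR);
	if (fd < 0) {
		perror("open /dev/ecryptfs");
		return 1;
	}
	pfd.fd = fd;
	pfd.events = POLLIN;
	while (poll(&pfd, 1, -1) > 0) {
		n = read(fd, buf, sizeof(buf));
		if (n <= 0)
			break;
		/* ... decode the packet (see the format comments below)
		 * and write() a response packet back ... */
	}
	close(fd);
	return 0;
}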
+ * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_release(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); + mutex_unlock(&daemon->mux); + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) { + printk(KERN_CRIT "%s: Fatal error whilst attempting to " + "shut down daemon; rc = [%d]. Please report this " + "bug.\n", __func__, rc); + BUG(); + } + module_put(THIS_MODULE); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_send_miscdev + * @data: Data to send to daemon; may be NULL + * @data_size: Amount of data to send to daemon + * @msg_ctx: Message context, which is used to handle the reply. If + * this is NULL, then we do not expect a reply. + * @msg_type: Type of message + * @msg_flags: Flags for message + * @daemon: eCryptfs daemon object + * + * Add msg_ctx to queue and then, if it exists, notify the blocked + * process about the data being available. Must be called with + * ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) +{ + int rc = 0; + + mutex_lock(&msg_ctx->mux); + if (data) { + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } + } else + msg_ctx->msg = NULL; + msg_ctx->type = msg_type; + if (data) { + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); + } else + msg_ctx->msg_size = 0; + mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +out_unlock: + mutex_unlock(&msg_ctx->mux); + return rc; +} + +/** + * ecryptfs_miscdev_read - format and send message from queue + * @file: fs/ecryptfs/euid miscdevfs handle (ignored) + * @buf: User buffer into which to copy the next message on the daemon queue + * @count: Amount of space available in @buf + * @ppos: Offset in file (ignored) + * + * Pulls the oldest message from the daemon queue, formats it for + * being sent via a miscdevfs handle, and copies it into @buf + * + * Returns the number of bytes copied into the user buffer + */ +static ssize_t +ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct ecryptfs_daemon *daemon; + struct ecryptfs_msg_ctx *msg_ctx; + size_t packet_length_size; + u32 counter_nbo; + char packet_length[3]; + size_t i; + size_t total_length; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? 
 */ + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + printk(KERN_WARNING "%s: Attempt to read from zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { + rc = 0; + goto out_unlock_daemon; + } + /* This daemon will not go away so long as this flag is set */ + daemon->flags |= ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&ecryptfs_daemon_hash_mux); +check_list: + if (list_empty(&daemon->msg_ctx_out_queue)) { + mutex_unlock(&daemon->mux); + rc = wait_event_interruptible( + daemon->wait, !list_empty(&daemon->msg_ctx_out_queue)); + mutex_lock(&daemon->mux); + if (rc < 0) { + rc = 0; + goto out_unlock_daemon; + } + } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + goto out_unlock_daemon; + } + if (list_empty(&daemon->msg_ctx_out_queue)) { + /* Something else jumped in since the + * wait_event_interruptible() and removed the + * message from the queue; try again */ + goto check_list; + } + BUG_ON(current->euid != daemon->euid); + BUG_ON(current->nsproxy->user_ns != daemon->user_ns); + BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->msg) { + rc = ecryptfs_write_packet_length(packet_length, + msg_ctx->msg_size, + &packet_length_size); + if (rc) { + rc = 0; + printk(KERN_WARNING "%s: Error writing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + } else { + packet_length_size = 0; + msg_ctx->msg_size = 0; + } + /* miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not + * include a message */ + total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); + if (count < total_length) { + rc = 0; + printk(KERN_WARNING "%s: Only given user buffer of " + "size [%Zd], but we need [%Zd] to read the " + "pending message\n", __func__, count, total_length); + goto out_unlock_msg_ctx; + } + i = 0; + buf[i++] = msg_ctx->type; + counter_nbo = cpu_to_be32(msg_ctx->counter); + memcpy(&buf[i], (char *)&counter_nbo, 4); + i += 4; + if (msg_ctx->msg) { + memcpy(&buf[i], packet_length, packet_length_size); + i += packet_length_size; + rc = copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size); + if (rc) { + printk(KERN_ERR "%s: copy_to_user returned error " + "[%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + i += msg_ctx->msg_size; + } + rc = i; + list_del(&msg_ctx->daemon_out_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + /* We do not expect a reply from the userspace daemon for any + * message type other than ECRYPTFS_MSG_REQUEST */ + if (msg_ctx->type != ECRYPTFS_MSG_REQUEST) + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); +out_unlock_msg_ctx: + mutex_unlock(&msg_ctx->mux); +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&daemon->mux); + return rc; +} + +/** + * ecryptfs_miscdev_helo + * @euid: effective user id of process sending helo packet + * @user_ns: The namespace in which @euid applies + * @pid: process id of process sending helo packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_helo(uid_t 
euid, struct user_namespace *user_ns, + struct pid *pid) +{ + int rc; + + rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns, + pid); + if (rc) + printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_miscdev_quit + * @euid: effective user id of process sending quit packet + * @user_ns: The namespace in which @euid applies + * @pid: process id of process sending quit packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) +{ + int rc; + + rc = ecryptfs_process_quit(euid, user_ns, pid); + if (rc) + printk(KERN_WARNING + "Error processing QUIT message; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_miscdev_response - process response to message previously sent to daemon + * @data: Bytes comprising struct ecryptfs_message + * @data_size: sizeof(struct ecryptfs_message) + data len + * @euid: Effective user id of process sending the miscdev response + * @user_ns: The namespace in which @euid applies + * @pid: Process id of process sending the miscdev response + * @seq: Sequence number for miscdev response packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_response(char *data, size_t data_size, + uid_t euid, struct user_namespace *user_ns, + struct pid *pid, u32 seq) +{ + struct ecryptfs_message *msg = (struct ecryptfs_message *)data; + int rc; + + if ((sizeof(*msg) + msg->data_len) != data_size) { + printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " + "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, + (sizeof(*msg) + msg->data_len), data_size); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_miscdev_write - handle write to daemon miscdev handle + * @file: File for misc dev handle (ignored) + * @buf: Buffer containing user data + * @count: Amount of data in @buf + * @ppos: Pointer to offset in file (ignored) + * + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter (0's for non-response) + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Returns the number of bytes read from @buf + */ +static ssize_t +ecryptfs_miscdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + u32 counter_nbo, seq; + size_t packet_size, packet_size_length, i; + ssize_t sz = 0; + char *data; + int rc; + + if (count == 0) + goto out; + data = kmalloc(count, GFP_KERNEL); + if (!data) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); + goto out; + } + rc = copy_from_user(data, buf, count); + if (rc) { + printk(KERN_ERR "%s: copy_from_user returned error [%d]\n", + __func__, rc); + goto out_free; + } + sz = count; + i = 0; + switch (data[i++]) { + case ECRYPTFS_MSG_RESPONSE: + if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { + printk(KERN_WARNING "%s: Minimum acceptable packet " + "size is [%Zd], but amount of data written is " + "only [%Zd]. 
Discarding response packet.\n", + __func__, + (1 + 4 + 1 + sizeof(struct ecryptfs_message)), + count); + goto out_free; + } + memcpy((char *)&counter_nbo, &data[i], 4); + seq = be32_to_cpu(counter_nbo); + i += 4; + rc = ecryptfs_parse_packet_length(&data[i], &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_free; + } + i += packet_size_length; + if ((1 + 4 + packet_size_length + packet_size) != count) { + printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" + " + packet_size([%Zd]))([%Zd]) != " + "count([%Zd]). Invalid packet format.\n", + __func__, packet_size_length, packet_size, + (1 + packet_size_length + packet_size), count); + goto out_free; + } + rc = ecryptfs_miscdev_response(&data[i], packet_size, + current->euid, + current->nsproxy->user_ns, + task_pid(current), seq); + if (rc) + printk(KERN_WARNING "%s: Failed to deliver miscdev " + "response to requesting operation; rc = [%d]\n", + __func__, rc); + break; + case ECRYPTFS_MSG_HELO: + rc = ecryptfs_miscdev_helo(current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to process " + "helo from pid [0x%p]; rc = [%d]\n", __func__, + task_pid(current), rc); + goto out_free; + } + break; + case ECRYPTFS_MSG_QUIT: + rc = ecryptfs_miscdev_quit(current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to process " + "quit from pid [0x%p]; rc = [%d]\n", __func__, + task_pid(current), rc); + goto out_free; + } + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping miscdev " + "message of unrecognized type [%d]\n", + data[0]); + break; + } +out_free: + kfree(data); +out: + return sz; +} + + +static const struct file_operations ecryptfs_miscdev_fops = { + .open = ecryptfs_miscdev_open, + .poll = ecryptfs_miscdev_poll, + .read = ecryptfs_miscdev_read, + .write = ecryptfs_miscdev_write, + .release = ecryptfs_miscdev_release, +}; + +static struct miscdevice ecryptfs_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ecryptfs", + .fops = &ecryptfs_miscdev_fops +}; + +/** + * ecryptfs_init_ecryptfs_miscdev + * + * Messages sent to the userspace daemon from the kernel are placed on + * a queue associated with the daemon. The next read against the + * miscdev handle by that daemon will return the oldest message placed + * on the message queue for the daemon. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_init_ecryptfs_miscdev(void) +{ + int rc; + + atomic_set(&ecryptfs_num_miscdev_opens, 0); + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = misc_register(&ecryptfs_miscdev); + if (rc) + printk(KERN_ERR "%s: Failed to register miscellaneous device " + "for communications with userspace daemons; rc = [%d]\n", + __func__, rc); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_destroy_ecryptfs_miscdev + * + * All of the daemons must be exorcised prior to calling this + * function. 
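The packet layout documented in the read and write paths above can be decoded symmetrically in the daemon. A hypothetical userspace decoder (parse_packet_length() is the userspace sketch given earlier; a five-octet packet carries a type with no message body, i.e. the HELO/QUIT style):

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* From the length-encoding sketch given earlier. */
int parse_packet_length(const unsigned char *data, size_t *size,
			size_t *length_size);

/* Octet 0 is the type, octets 1-4 a big-endian counter, then an
 * optional length plus struct ecryptfs_message. */
int decode_miscdev_packet(const unsigned char *buf, size_t buf_len,
			  uint8_t *type, uint32_t *seq,
			  const unsigned char **payload, size_t *payload_len)
{
	uint32_t counter_nbo;
	size_t length_size;

	if (buf_len < 5)
		return -1;		/* type + counter always present */
	*type = buf[0];
	memcpy(&counter_nbo, &buf[1], 4);
	*seq = ntohl(counter_nbo);	/* counter travels big-endian */
	if (buf_len == 5) {		/* type-only packet */
		*payload = NULL;
		*payload_len = 0;
		return 0;
	}
	if (parse_packet_length(&buf[5], payload_len, &length_size) != 0)
		return -1;
	if (5 + length_size + *payload_len > buf_len)
		return -1;		/* truncated packet */
	*payload = &buf[5 + length_size];
	return 0;
}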
+ */ +void ecryptfs_destroy_ecryptfs_miscdev(void) +{ + BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0); + misc_deregister(&ecryptfs_miscdev); +} diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 6df1debdccc..2b6fe1e6e8b 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -153,7 +153,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " - "region; rc = [%d]\n", __FUNCTION__, rc); + "region; rc = [%d]\n", __func__, rc); goto out; } } else { @@ -169,7 +169,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, if (rc) { printk(KERN_ERR "%s: Error attempting to read " "extent at offset [%lld] in the lower " - "file; rc = [%d]\n", __FUNCTION__, + "file; rc = [%d]\n", __func__, lower_offset, rc); goto out; } @@ -212,7 +212,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page) "the encrypted content from the lower " "file whilst inserting the metadata " "from the xattr into the header; rc = " - "[%d]\n", __FUNCTION__, rc); + "[%d]\n", __func__, rc); goto out; } @@ -293,7 +293,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error attemping to read " "lower page segment; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(page); goto out; } else @@ -308,7 +308,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, "from the lower file whilst " "inserting the metadata from " "the xattr into the header; rc " - "= [%d]\n", __FUNCTION__, rc); + "= [%d]\n", __func__, rc); ClearPageUptodate(page); goto out; } @@ -320,7 +320,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(page); goto out; } @@ -331,7 +331,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error decrypting page " "at index [%ld]; rc = [%d]\n", - __FUNCTION__, page->index, rc); + __func__, page->index, rc); ClearPageUptodate(page); goto out; } @@ -348,7 +348,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error on attempt to " "truncate to (higher) offset [%lld];" - " rc = [%d]\n", __FUNCTION__, + " rc = [%d]\n", __func__, prev_page_end_size, rc); goto out; } @@ -389,7 +389,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) kfree(file_size_virt); if (rc) printk(KERN_ERR "%s: Error writing file size to header; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); out: return rc; } diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c index f638a698dc5..e0abad62b39 100644 --- a/fs/ecryptfs/netlink.c +++ b/fs/ecryptfs/netlink.c @@ -44,8 +44,8 @@ static struct sock *ecryptfs_nl_sock; * upon sending the message; non-zero upon error. */ int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid) + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -60,7 +60,7 @@ int ecryptfs_send_netlink(char *data, int data_len, ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); goto out; } - nlh = NLMSG_PUT(skb, daemon_pid, msg_ctx ? msg_ctx->counter : 0, + nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? 
msg_ctx->counter : 0, msg_type, payload_len); nlh->nlmsg_flags = msg_flags; if (msg_ctx && payload_len) { @@ -69,7 +69,7 @@ int ecryptfs_send_netlink(char *data, int data_len, msg->data_len = data_len; memcpy(msg->data, data, data_len); } - rc = netlink_unicast(ecryptfs_nl_sock, skb, daemon_pid, 0); + rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " "message; rc = [%d]\n", rc); @@ -99,6 +99,7 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct ecryptfs_message *msg = NLMSG_DATA(nlh); + struct pid *pid; int rc; if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { @@ -107,8 +108,10 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb) "incorrectly specified data length\n"); goto out; } - rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq); + pid = find_get_pid(NETLINK_CREDS(skb)->pid); + rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL, + pid, nlh->nlmsg_seq); + put_pid(pid); if (rc) printk(KERN_ERR "Error processing response message; rc = [%d]\n", rc); @@ -126,11 +129,13 @@ out: */ static int ecryptfs_process_nl_helo(struct sk_buff *skb) { + struct pid *pid; int rc; + pid = find_get_pid(NETLINK_CREDS(skb)->pid); rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, - NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid); + NETLINK_CREDS(skb)->uid, NULL, pid); + put_pid(pid); if (rc) printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); return rc; @@ -147,10 +152,12 @@ static int ecryptfs_process_nl_helo(struct sk_buff *skb) */ static int ecryptfs_process_nl_quit(struct sk_buff *skb) { + struct pid *pid; int rc; - rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid); + pid = find_get_pid(NETLINK_CREDS(skb)->pid); + rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid); + put_pid(pid); if (rc) printk(KERN_WARNING "Error processing QUIT message; rc = [%d]\n", rc); @@ -176,20 +183,20 @@ static void ecryptfs_receive_nl_message(struct sk_buff *skb) goto free; } switch (nlh->nlmsg_type) { - case ECRYPTFS_NLMSG_RESPONSE: + case ECRYPTFS_MSG_RESPONSE: if (ecryptfs_process_nl_response(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "deliver netlink response to " "requesting operation\n"); } break; - case ECRYPTFS_NLMSG_HELO: + case ECRYPTFS_MSG_HELO: if (ecryptfs_process_nl_helo(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "fulfill HELO request\n"); } break; - case ECRYPTFS_NLMSG_QUIT: + case ECRYPTFS_MSG_QUIT: if (ecryptfs_process_nl_quit(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "fulfill QUIT request\n"); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0c4928623bb..ebf55150be5 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -55,7 +55,7 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, set_fs(fs_save); if (octets_written < 0) { printk(KERN_ERR "%s: octets_written = [%td]; " - "expected [%td]\n", __FUNCTION__, octets_written, size); + "expected [%td]\n", __func__, octets_written, size); rc = -EINVAL; } mutex_unlock(&inode_info->lower_file_mutex); @@ -153,7 +153,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, rc = PTR_ERR(ecryptfs_page); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " - "mapping; rc = [%d]\n", __FUNCTION__, + "mapping; rc = [%d]\n", __func__, 
ecryptfs_page_idx, rc); goto out; } @@ -165,7 +165,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, if (rc) { printk(KERN_ERR "%s: Error decrypting " "page; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(ecryptfs_page); page_cache_release(ecryptfs_page); goto out; @@ -202,7 +202,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, page_cache_release(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " - "page; rc = [%d]\n", __FUNCTION__, rc); + "page; rc = [%d]\n", __func__, rc); goto out; } pos += num_bytes; @@ -254,7 +254,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, set_fs(fs_save); if (octets_read < 0) { printk(KERN_ERR "%s: octets_read = [%td]; " - "expected [%td]\n", __FUNCTION__, octets_read, size); + "expected [%td]\n", __func__, octets_read, size); rc = -EINVAL; } mutex_unlock(&inode_info->lower_file_mutex); @@ -327,7 +327,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, printk(KERN_ERR "%s: Attempt to read data past the end of the " "file; offset = [%lld]; size = [%td]; " "ecryptfs_file_size = [%lld]\n", - __FUNCTION__, offset, size, ecryptfs_file_size); + __func__, offset, size, ecryptfs_file_size); goto out; } pos = offset; @@ -345,14 +345,14 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, rc = PTR_ERR(ecryptfs_page); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " - "mapping; rc = [%d]\n", __FUNCTION__, + "mapping; rc = [%d]\n", __func__, ecryptfs_page_idx, rc); goto out; } rc = ecryptfs_decrypt_page(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", __FUNCTION__, rc); + "page; rc = [%d]\n", __func__, rc); ClearPageUptodate(ecryptfs_page); page_cache_release(ecryptfs_page); goto out; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a415f42d32c..221086fef17 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -257,25 +257,6 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1, (p1->file < p2->file ? 
-1 : p1->fd - p2->fd)); } -/* Special initialization for the RB tree node to detect linkage */ -static inline void ep_rb_initnode(struct rb_node *n) -{ - rb_set_parent(n, n); -} - -/* Removes a node from the RB tree and marks it for a fast is-linked check */ -static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r) -{ - rb_erase(n, r); - rb_set_parent(n, n); -} - -/* Fast check to verify that the item is linked to the main RB tree */ -static inline int ep_rb_linked(struct rb_node *n) -{ - return rb_parent(n) != n; -} - /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct list_head *p) { @@ -283,13 +264,13 @@ static inline int ep_is_linked(struct list_head *p) } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem * ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /* Get the "struct epitem" from an epoll queue wrapper */ -static inline struct epitem * ep_item_from_epqueue(poll_table *p) +static inline struct epitem *ep_item_from_epqueue(poll_table *p) { return container_of(p, struct ep_pqueue, pt)->epi; } @@ -411,8 +392,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->fllink); spin_unlock(&file->f_ep_lock); - if (ep_rb_linked(&epi->rbn)) - ep_rb_erase(&epi->rbn, &ep->rbr); + rb_erase(&epi->rbn, &ep->rbr); spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -728,7 +708,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, goto error_return; /* Item initialization follow here ... */ - ep_rb_initnode(&epi->rbn); INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); @@ -1262,7 +1241,7 @@ error_return: return error; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK /* * Implement the event wait interface for the eventpoll file. It is the kernel @@ -1300,7 +1279,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, if (error == -EINTR) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } else sigprocmask(SIG_SETMASK, &sigsaved, NULL); } @@ -1308,7 +1287,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, return error; } -#endif /* HAVE_SET_RESTORE_SIGMASK */ static int __init eventpoll_init(void) { @@ -1330,4 +1309,3 @@ static int __init eventpoll_init(void) return 0; } fs_initcall(eventpoll_init); - diff --git a/fs/exec.c b/fs/exec.c index b152029f18f..9f9f931ef94 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -735,6 +735,7 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); + mm_update_next_owner(mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -765,9 +766,7 @@ static int de_thread(struct task_struct *tsk) /* * Kill all other threads in the thread group. - * We must hold tasklist_lock to call zap_other_threads. */ - read_lock(&tasklist_lock); spin_lock_irq(lock); if (signal_group_exit(sig)) { /* * return so that the signal is processed. */ spin_unlock_irq(lock); - read_unlock(&tasklist_lock); return -EAGAIN; } - - /* - * child_reaper ignores SIGKILL, change it now. - * Reparenting needs write_lock on tasklist_lock, - * so it is safe to do it under read_lock.
- */ - if (unlikely(tsk->group_leader == task_child_reaper(tsk))) - task_active_pid_ns(tsk)->child_reaper = tsk; - sig->group_exit_task = tsk; zap_other_threads(tsk); - read_unlock(&tasklist_lock); /* Account for the thread group leader hanging around: */ count = thread_group_leader(tsk) ? 1 : 2; @@ -810,7 +798,7 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) { leader = tsk->group_leader; - sig->notify_count = -1; + sig->notify_count = -1; /* for exit_notify() */ for (;;) { write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) @@ -820,6 +808,8 @@ static int de_thread(struct task_struct *tsk) schedule(); } + if (unlikely(task_child_reaper(tsk) == leader)) + task_active_pid_ns(tsk)->child_reaper = tsk; /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. @@ -963,6 +953,8 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; + set_mm_exe_file(bprm->mm, bprm->file); + /* * Release all of the old mmap stuff */ @@ -1268,7 +1260,6 @@ int do_execve(char * filename, { struct linux_binprm *bprm; struct file *file; - unsigned long env_p; struct files_struct *displaced; int retval; @@ -1321,11 +1312,9 @@ int do_execve(char * filename, if (retval < 0) goto out; - env_p = bprm->p; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; - bprm->argv_len = env_p - bprm->p; retval = search_binary_handler(bprm,regs); if (retval >= 0) { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 109ab5e44ec..cc91227d3bb 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -150,12 +150,12 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) if (IS_ERR(ppd)) { err = PTR_ERR(ppd); dprintk("%s: get_parent of %ld failed, err %d\n", - __FUNCTION__, pd->d_inode->i_ino, err); + __func__, pd->d_inode->i_ino, err); dput(pd); break; } - dprintk("%s: find name of %lu in %lu\n", __FUNCTION__, + dprintk("%s: find name of %lu in %lu\n", __func__, pd->d_inode->i_ino, ppd->d_inode->i_ino); err = exportfs_get_name(mnt, ppd, nbuf, pd); if (err) { @@ -168,14 +168,14 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) continue; break; } - dprintk("%s: found name: %s\n", __FUNCTION__, nbuf); + dprintk("%s: found name: %s\n", __func__, nbuf); mutex_lock(&ppd->d_inode->i_mutex); npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); mutex_unlock(&ppd->d_inode->i_mutex); if (IS_ERR(npd)) { err = PTR_ERR(npd); dprintk("%s: lookup failed: %d\n", - __FUNCTION__, err); + __func__, err); dput(ppd); dput(pd); break; @@ -188,7 +188,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) if (npd == pd) noprogress = 0; else - printk("%s: npd != pd\n", __FUNCTION__); + printk("%s: npd != pd\n", __func__); dput(npd); dput(ppd); if (IS_ROOT(pd)) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index cc47b76091b..6ae4ecf3ce4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1261,10 +1261,11 @@ static int ext3_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - copied = ext3_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; } ret2 = ext3_journal_stop(handle); if (!ret) @@ -1289,10 +1290,11 @@ static int ext3_writeback_write_end(struct file *file, if (new_i_size > EXT3_I(inode)->i_disksize) 
EXT3_I(inode)->i_disksize = new_i_size; - copied = ext3_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; ret2 = ext3_journal_stop(handle); if (!ret) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a8bae8cd1d5..3c8dab880d9 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -9,8 +9,8 @@ #include <linux/slab.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" @@ -37,7 +37,7 @@ ext4_acl_from_disk(const void *value, size_t size) return ERR_PTR(-EINVAL); if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n=0; n < count; n++) { @@ -91,7 +91,7 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * - sizeof(ext4_acl_entry), GFP_KERNEL); + sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); @@ -187,7 +187,7 @@ ext4_get_acl(struct inode *inode, int type) } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); @@ -335,7 +335,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); + clone = posix_acl_clone(acl, GFP_NOFS); error = -ENOMEM; if (!clone) goto cleanup; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0737e05ba3d..da994374ec3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -15,12 +15,12 @@ #include <linux/capability.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> - +#include "ext4.h" +#include "ext4_jbd2.h" #include "group.h" + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -48,7 +48,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned long start; int bit, bit_max; unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -59,7 +58,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Checksum bad for group %lu\n", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -106,11 +105,12 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, free_blocks = group_blocks - bit_max; if (bh) { + ext4_fsblk_t start; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); - start = block_group * EXT4_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(sbi->s_es->s_first_data_block); + start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data); @@ -235,7 +235,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Invalid block bitmap - " "block_group = %d, block = %llu", block_group, bitmap_blk); @@ -264,7 +264,7 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %llu", (int)block_group, (unsigned long long)bitmap_blk); @@ -281,7 +281,7 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %llu", (int)block_group, (unsigned long long)bitmap_blk); @@ -360,7 +360,7 @@ restart: BUG(); } #define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) + __rsv_window_dump((root), (verbose), __func__) #else #define rsv_window_dump(root, verbose) do {} while (0) #endif @@ -740,7 +740,7 @@ do_more: if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { jbd_unlock_bh_state(bitmap_bh); - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); jbd_lock_bh_state(bitmap_bh); @@ -752,9 +752,7 @@ do_more: jbd_unlock_bh_state(bitmap_bh); spin_lock(sb_bgl_lock(sbi, block_group)); - desc->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + - group_freed); + le16_add_cpu(&desc->bg_free_blocks_count, group_freed); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); @@ -1798,7 +1796,7 @@ allocated: if (ext4_test_bit(grp_alloc_blk+i, bh2jh(bitmap_bh)->b_committed_data)) { printk("%s: block was unexpectedly set in " - "b_committed_data\n", __FUNCTION__); + "b_committed_data\n", __func__); } } } @@ -1823,8 +1821,7 @@ allocated: spin_lock(sb_bgl_lock(sbi, group_no)); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); + le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); percpu_counter_sub(&sbi->s_freeblocks_counter, num); diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 420554f8f79..d37ea675045 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -9,7 +9,7 @@ #include <linux/buffer_head.h> #include <linux/jbd2.h> 
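[The fs/ext4/balloc.c hunks above replace the open-coded endian read-modify-write, cpu_to_le16(le16_to_cpu(x) + n), with the le16_add_cpu() helper. A minimal standalone sketch of what such a helper does; the in-kernel definition lives in the byteorder headers, so this version is for illustration only:

	static inline void le16_add_cpu(__le16 *var, u16 val)
	{
		/* decode from little-endian, add in CPU order, re-encode */
		*var = cpu_to_le16(le16_to_cpu(*var) + val);
	}

A negative val subtracts, which is how le16_add_cpu(&gdp->bg_free_blocks_count, -num) charges the allocated blocks in the hunk above.]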
-#include <linux/ext4_fs.h> +#include "ext4.h" #ifdef EXT4FS_DEBUG diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2c23bade9aa..2bf0331ea19 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -23,10 +23,10 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include "ext4.h" static unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK @@ -42,7 +42,7 @@ const struct file_operations ext4_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext4_readdir, /* we take BKL. needed?*/ - .ioctl = ext4_ioctl, /* BKL held */ + .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000000..8158083f7ac --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,1205 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/magic.h> +#include "ext4_i.h" + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Define EXT4_RESERVATION to reserve data blocks for expanding files + */ +#define EXT4_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT4_MAX_RESERVE_BLOCKS 1027 +#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(f, a...) do {} while (0) +#endif + +#define EXT4_MULTIBLOCK_ALLOCATOR 1 + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 1 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 2 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 4 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 8 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 16 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 32 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 64 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 128 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 256 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 512 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* logical block in target inode */ + ext4_lblk_t logical; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* phys. block for ^^^ */ + ext4_fsblk_t pleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. block for ^^^ */ + ext4_fsblk_t pright; + /* how many blocks we want to allocate */ + unsigned long len; + /* flags. 
see above EXT4_MB_HINT_* */ + unsigned long flags; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __u32 bg_reserved2[3]; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +#ifdef __KERNEL__ +#include "ext4_sb.h" +#endif +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) 
(EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* + * Inode dynamic state flags + */ +#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ +#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* + * Following is used by preallocation code to tell get_blocks() that we + * want uninitialzed extents. 
+ */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_MIGRATE _IO('f', 7) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + + +#define EXT4_EPOCH_BITS 2 +#define 
EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + time->tv_sec >> 32 : 0) | + ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * File system states + */ +#define EXT4_VALID_FS 
0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt +#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS +#endif + +#define ext4_set_bit ext2_set_bit +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_clear_bit ext2_clear_bit +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit ext2_test_bit +#define ext4_find_first_zero_bit ext2_find_first_zero_bit +#define ext4_find_next_zero_bit ext2_find_next_zero_bit +#define ext4_find_next_bit ext2_find_next_bit + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define 
EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
+ */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u32 s_reserved[163]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define 
EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +static inline unsigned ext4_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN) + return 1 << 16; + return len; +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT4_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. 
+ */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp); +extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern void ext4_free_blocks (handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, int metadata); +extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); +extern void ext4_check_blocks_bitmap (struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext4_init_block_alloc_info(struct inode *); +extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); + +/* dir.c */ +extern int ext4_check_dir_entry(const char *, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file (struct file *, struct dentry *, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); +extern void ext4_free_inode (handle_t *, struct inode *); +extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes (struct super_block *); +extern unsigned long ext4_count_dirs (struct super_block *); +extern void ext4_check_inodes_bitmap (struct super_block *); +extern unsigned long ext4_count_free (struct buffer_head *, 
unsigned); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_mb_discard_inode_preallocations(struct inode *); +extern int __init init_ext4_mballoc(void); +extern void exit_ext4_mballoc(void); +extern void ext4_mb_free_blocks(handle_t *, struct inode *, + unsigned long, unsigned long, int, unsigned long *); + + +/* inode.c */ +int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode (struct inode *, int); +extern int ext4_setattr (struct dentry *, struct iattr *); +extern void ext4_delete_inode (struct inode *); +extern int ext4_sync_inode (handle_t *, struct inode *); +extern void ext4_discard_reservation (struct inode *); +extern void ext4_dirty_inode(struct inode *); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern void ext4_truncate (struct inode *); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, + unsigned long); +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void ext4_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __ext4_std_error (struct super_block *, const char *, int); +extern void ext4_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_warning (struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern void ext4_update_dynamic_rev (struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __FUNCTION__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; 
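[The static inlines just above assemble ext4's wide block counts from the two 32-bit on-disk words and split them back. A worked round-trip with the cpu_to_le32()/le32_to_cpu() conversions elided for clarity; the values are illustrative only:

	u64 blk  = 0x123456789ULL;		/* a count wider than 32 bits */
	u32 lo   = (u32)blk;			/* 0x23456789 -> s_blocks_count_lo */
	u32 hi   = (u32)(blk >> 32);		/* 0x00000001 -> s_blocks_count_hi */
	u64 back = ((u64)hi << 32) | lo;	/* back == blk, as ext4_blocks_count() computes */

ext4_isize()/ext4_isize_set() apply the same hi/lo split to the 64-bit file size.]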
+extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create, int extend_disksize); +extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len); +extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, + sector_t block, unsigned long max_blocks, + struct buffer_head *bh, int create, + int extend_disksize); +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 00000000000..75333b595fa --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * If EXT_DEBUG is defined you can use the 'extdebug' mount option + * to get lots of info about what's going on. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These numbers will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is the index on-disk structure. + * It's used at all the levels except the bottom.
+ */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even an inode-stored one, has a header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +#define EXT4_EXT_CACHE_NO 0 +#define EXT4_EXT_CACHE_GAP 1 +#define EXT4_EXT_CACHE_EXTENT 2 + + +#define EXT_MAX_BLOCK 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of the ee_len field in the extent datastructure to signify whether this + * particular extent is an initialized extent or an uninitialized (i.e. + * preallocated) one. + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if the MSB of ee_len is set, it is an + * uninitialized extent, with only one special scenario when ee_len = 0x8000. + * In this case we cannot have an uninitialized extent of zero length and + * thus we treat it as a special case of an initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
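A worked illustration of the ee_len convention just described, as a self-contained sketch (host-endian uint16_t stands in for __le16; the helpers are hypothetical mirrors of ext4_ext_is_uninitialized() and ext4_ext_get_actual_len() defined below):

#include <stdint.h>
#include <stdio.h>

#define SKETCH_INIT_MAX_LEN (1U << 15)  /* 0x8000, as EXT_INIT_MAX_LEN */

/* MSB set => uninitialized, except 0x8000 itself, which is an
 * initialized extent of the maximum length 32768 */
static int sketch_is_uninitialized(uint16_t ee_len)
{
        return ee_len > SKETCH_INIT_MAX_LEN;
}

static unsigned int sketch_actual_len(uint16_t ee_len)
{
        return ee_len <= SKETCH_INIT_MAX_LEN ?
                ee_len : ee_len - SKETCH_INIT_MAX_LEN;
}

int main(void)
{
        /* 0x0064: initialized, 100 blocks
         * 0x8000: initialized, 32768 blocks (the special case)
         * 0x8064: uninitialized, 100 blocks */
        const uint16_t samples[] = { 0x0064, 0x8000, 0x8064 };
        int i;

        for (i = 0; i < 3; i++)
                printf("ee_len=0x%04x uninit=%d len=%u\n", samples[i],
                       sketch_is_uninitialized(samples[i]),
                       sketch_actual_len(samples[i]));
        return 0;
}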
+ */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_tree_changed(struct inode *inode) +{ + EXT4_I(inode)->i_ext_generation++; +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? 
+ le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); +extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *); +extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h new file mode 100644 index 00000000000..26a4ae255d7 --- /dev/null +++ b/fs/ext4/ext4_i.h @@ -0,0 +1,167 @@ +/* + * ext4_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_I +#define _EXT4_I + +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned long ext4_group_t; + +struct ext4_reserve_window { + ext4_fsblk_t _rsv_start; /* First byte reserved */ + ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext4_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext4_reserve_window rsv_window; +}; + +struct ext4_block_alloc_info { + /* information about reservation window */ + struct ext4_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext4_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + ext4_lblk_t last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext4_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. 
+ */ + ext4_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * third extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + /* block reservation info */ + struct ext4_block_alloc_info *i_block_alloc_info; + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4DEV_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + + unsigned long i_ext_generation; + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is the same as that of + * struct timespec i_{a,c,m}time in the generic inode.
+ */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; +}; + +#endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d6afe4e2734..c75384b34f2 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -2,14 +2,14 @@ * Interface between ext4 and JBD */ -#include <linux/ext4_jbd2.h> +#include "ext4_jbd2.h" int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { int err = jbd2_journal_get_undo_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -18,7 +18,7 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, { int err = jbd2_journal_get_write_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -27,7 +27,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, { int err = jbd2_journal_forget(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -36,7 +36,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, { int err = jbd2_journal_revoke(handle, blocknr, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -45,7 +45,7 @@ int __ext4_journal_get_create_access(const char *where, { int err = jbd2_journal_get_create_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -54,6 +54,6 @@ int __ext4_journal_dirty_metadata(const char *where, { int err = jbd2_journal_dirty_metadata(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000000..9255a7d28b2 --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,231 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 
27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits get this low; we then need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, we end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext4 can control + * ext2 filesystems, so ext2+ext4 systems only need one fs. This work hasn't + * been done yet.
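To make the credit macros above concrete: on an extents-enabled filesystem with quota compiled out, EXT4_SINGLEDATA_TRANS_BLOCKS is 27 and EXT4_QUOTA_TRANS_BLOCKS is 0, so EXT4_DATA_TRANS_BLOCKS comes to 27 + 6 - 2 + 0 = 31 and EXT4_DELETE_TRANS_BLOCKS to 2*31 + 64 = 126. A sketch of that arithmetic with the configuration folded into constants (hypothetical names):

#include <stdio.h>

/* assumed configuration: INCOMPAT_EXTENTS set, CONFIG_QUOTA off */
#define SKETCH_SINGLEDATA_TRANS_BLOCKS  27U
#define SKETCH_XATTR_TRANS_BLOCKS       6U
#define SKETCH_QUOTA_TRANS_BLOCKS       0U

#define SKETCH_DATA_TRANS_BLOCKS \
        (SKETCH_SINGLEDATA_TRANS_BLOCKS + SKETCH_XATTR_TRANS_BLOCKS - 2 + \
         2 * SKETCH_QUOTA_TRANS_BLOCKS)
#define SKETCH_DELETE_TRANS_BLOCKS      (2 * SKETCH_DATA_TRANS_BLOCKS + 64)

int main(void)
{
        /* prints: data=31 delete=126 */
        printf("data=%u delete=%u\n",
               SKETCH_DATA_TRANS_BLOCKS, SKETCH_DELETE_TRANS_BLOCKS);
        return 0;
}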
+ */ + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + jbd2_journal_release_buffer(handle, bh); +} + +void ext4_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_revoke(const char *where, handle_t *handle, + ext4_fsblk_t blocknr, struct buffer_head *bh); + +int __ext4_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext4_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext4_journal_get_undo_access(handle, bh) \ + __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_revoke(handle, blocknr, bh) \ + __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_dirty_metadata(handle, bh) \ + __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) +#define ext4_journal_forget(handle, bh) \ + __ext4_journal_forget(__FUNCTION__, (handle), (bh)) + +int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__FUNCTION__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + return jbd2_journal_extend(handle, nblocks); +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2_journal_restart(handle, nblocks); +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + return jbd2_journal_blocks_per_page(inode); +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + return jbd2_journal_force_commit(journal); +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +static inline int ext4_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 1; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/ext4_sb.h 
b/fs/ext4/ext4_sb.h new file mode 100644 index 00000000000..5802e69f219 --- /dev/null +++ b/fs/ext4/ext4_sb.h @@ -0,0 +1,148 @@ +/* + * ext4_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_SB +#define _EXT4_SB + +#ifdef __KERNEL__ +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#endif +#include <linux/rbtree.h> + +/* + * third extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext4_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + struct list_head s_active_transaction; + struct list_head s_closed_transaction; + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t 
s_last_transaction; + unsigned short *s_mb_offsets, *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned long s_mb_stream_request; + unsigned long s_mb_max_to_scan; + unsigned long s_mb_min_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; + unsigned long s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + struct proc_dir_entry *s_mb_proc; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; +}; + +#endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9ae6e67090c..47929c4e3da 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -32,7 +32,6 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -40,8 +39,9 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/falloc.h> -#include <linux/ext4_fs_extents.h> #include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" /* @@ -308,7 +308,7 @@ corrupted: } #define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__FUNCTION__, inode, eh, depth) + __ext4_ext_check_header(__func__, inode, eh, depth) #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -614,7 +614,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); - curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) > le16_to_cpu(curp->p_hdr->eh_max)); @@ -736,7 +736,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } if (m) { memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); - neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); + le16_add_cpu(&neh->eh_entries, m); } set_buffer_uptodate(bh); @@ -753,8 +753,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; - path[depth].p_hdr->eh_entries = - cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; @@ -817,8 +816,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, if (m) { memmove(++fidx, path[i].p_idx - m, sizeof(struct ext4_extent_idx) * m); - neh->eh_entries = - cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); + le16_add_cpu(&neh->eh_entries, m); } 
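The conversions in this hunk (and in ialloc.c further down) replace the open-coded read/convert/add/store sequence with le16_add_cpu(). Its effect, shown as a user-space sketch with a no-op endian conversion standing in for the kernel's (assuming a little-endian host):

#include <assert.h>
#include <stdint.h>

typedef uint16_t sketch_le16;           /* stand-in for __le16 */
#define sketch_le16_to_cpu(x)   (x)     /* no-op on a little-endian host */
#define sketch_cpu_to_le16(x)   (x)

/* what le16_add_cpu(&var, val) boils down to: exactly the
 * open-coded pattern the patch removes */
static void sketch_le16_add_cpu(sketch_le16 *var, uint16_t val)
{
        *var = sketch_cpu_to_le16(sketch_le16_to_cpu(*var) + val);
}

int main(void)
{
        sketch_le16 eh_entries = 5;

        sketch_le16_add_cpu(&eh_entries, 1);    /* one entry inserted */
        sketch_le16_add_cpu(&eh_entries, -1);   /* one entry removed */
        assert(eh_entries == 5);
        return 0;
}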
set_buffer_uptodate(bh); unlock_buffer(bh); @@ -834,7 +832,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; - path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; @@ -1369,7 +1367,7 @@ int ext4_ext_try_to_merge(struct inode *inode, * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1); + le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) @@ -1560,7 +1558,7 @@ has_space: path[depth].p_ext = nearex; } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1); + le16_add_cpu(&eh->eh_entries, 1); nearex = path[depth].p_ext; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext_pblock(newext)); @@ -1699,7 +1697,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; - path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1); + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; @@ -1902,7 +1900,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); + le16_add_cpu(&eh->eh_entries, -1); } ex->ee_block = cpu_to_le32(block); @@ -1979,7 +1977,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; @@ -2138,6 +2136,82 @@ void ext4_ext_release(struct super_block *sb) #endif } +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + int ret = -EIO; + struct bio *bio; + int blkbits, blocksize; + sector_t ee_pblock; + struct completion event; + unsigned int ee_len, len, done, offset; + + + blkbits = inode->i_blkbits; + blocksize = inode->i_sb->s_blocksize; + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext_pblock(ex); + + /* convert ee_pblock to 512 byte sectors */ + ee_pblock = ee_pblock << (blkbits - 9); + + while (ee_len > 0) { + + if (ee_len > BIO_MAX_PAGES) + len = BIO_MAX_PAGES; + else + len = ee_len; + + bio = bio_alloc(GFP_NOIO, len); + if (!bio) + return -ENOMEM; + bio->bi_sector = ee_pblock; + bio->bi_bdev = inode->i_sb->s_bdev; + + done = 0; + offset = 0; + while (done < len) { + ret = bio_add_page(bio, ZERO_PAGE(0), + blocksize, offset); + if (ret != blocksize) { + /* + * We can't add any more pages because of + * hardware limitations. Start a new bio. 
+ */ + break; + } + done++; + offset += blocksize; + if (offset >= PAGE_CACHE_SIZE) + offset = 0; + } + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(WRITE, bio); + wait_for_completion(&event); + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + ret = 0; + else { + ret = -EIO; + break; + } + bio_put(bio); + ee_len -= done; + ee_pblock += done << (blkbits - 9); + } + return ret; +} + +#define EXT4_EXT_ZERO_LEN 7 + /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2154,7 +2228,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_lblk_t iblock, unsigned long max_blocks) { - struct ext4_extent *ex, newex; + struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; @@ -2173,10 +2247,26 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, allocated = ee_len - (iblock - ee_block); newblock = iblock - ee_block + ext_pblock(ex); ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; + /* If the extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero out directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + } /* ex1: ee_block to iblock - 1 : uninitialized */ if (iblock > ee_block) { @@ -2195,19 +2285,103 @@ /* ex3: to ee_block + ee_len : uninitialised */ if (allocated > max_blocks) { unsigned int newdepth; + /* If the extent has less than EXT4_EXT_ZERO_LEN blocks, zero out directly */ + if (allocated <= EXT4_EXT_ZERO_LEN) { + /* Mark the first half uninitialized. + * Mark the second half initialized and zero out the + * initialized extent + */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = cpu_to_le16(ee_len - allocated); + ext4_ext_mark_uninitialized(ex); + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); + err = ext4_ext_insert_extent(handle, inode, path, ex3); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + + } else if (err) + goto fix_extent_len; + + /* + * We need to zero out the second half because + * a fallocate request can update file size and + * converting the second half to an initialized extent + * implies that we can leak some junk data to user + * space.
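One unit conversion in ext4_ext_zeroout() above is easy to miss: bios address 512-byte sectors, so a filesystem block number is shifted left by (blkbits - 9), as in ee_pblock << (blkbits - 9). The arithmetic as a standalone sketch:

#include <assert.h>
#include <stdint.h>

/* a block of 2^blkbits bytes spans 2^(blkbits - 9) sectors */
static uint64_t sketch_block_to_sector(uint64_t pblock, unsigned int blkbits)
{
        return pblock << (blkbits - 9);
}

int main(void)
{
        assert(sketch_block_to_sector(100, 12) == 800); /* 4 KiB blocks */
        assert(sketch_block_to_sector(100, 10) == 200); /* 1 KiB blocks */
        return 0;
}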
+ */ + err = ext4_ext_zeroout(inode, ex3); + if (err) { + /* + * We should actually mark the + * second half as uninit and return an error; + * the insert would have changed the extent + */ + depth = ext_depth(inode); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + return err; + } + ex = path[depth].p_ext; + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; + } + + /* zeroed the second half */ + return allocated; + } ex3 = &newex; ex3->ee_block = cpu_to_le32(iblock + max_blocks); ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3); - if (err) - goto out; + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + + } else if (err) + goto fix_extent_len; /* * The depth, and hence eh & ex might change * as part of the insert above. */ newdepth = ext_depth(inode); + /* + * update the extent length after successful insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); if (newdepth != depth) { depth = newdepth; ext4_ext_drop_refs(path); @@ -2226,6 +2400,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; } allocated = max_blocks; + + /* If the extent has less than EXT4_EXT_ZERO_LEN blocks and we are trying + * to insert an extent in the middle, zero out directly; + * otherwise give the extent a chance to merge to the left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && + iblock != ee_block) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } } /* * If there was a change of depth as part of the @@ -2282,8 +2474,29 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; out: return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; } /* @@ -2393,8 +2606,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, } if (create == EXT4_CREATE_UNINITIALIZED_EXT) goto out; - if (!create) + if (!create) { + /* + * We have blocks reserved already.
We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + if (allocated > max_blocks) + allocated = max_blocks; + /* mark the buffer unwritten */ + __set_bit(BH_Unwritten, &bh_result->b_state); goto out2; + } ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, @@ -2584,6 +2809,8 @@ out_stop: ext4_orphan_del(handle, inode); up_write(&EXT4_I(inode)->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } @@ -2608,6 +2835,28 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) return needed; } +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + } + +} + /* * preallocate space for a file. This implements ext4's fallocate inode * operation, which gets called from sys_fallocate system call. @@ -2619,8 +2868,8 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { handle_t *handle; ext4_lblk_t block; + loff_t new_size; unsigned long max_blocks; - ext4_fsblk_t nblocks = 0; int ret = 0; int ret2 = 0; int retries = 0; @@ -2639,9 +2888,12 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) return -ENODEV; block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because, + * if blocksize = 4096, offset = 3072 and len = 2048 + */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; - + - block; /* * credits to insert 1 extent into extent tree + buffers to be able to * modify 1 super block, 1 block bitmap and 1 group descriptor. @@ -2657,7 +2909,6 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); @@ -2673,61 +2924,24 @@ retry: ret2 = ext4_journal_stop(handle); break; } - if (ret > 0) { - /* check wrap through sign-bit/zero here */ - if ((block + ret) < 0 || (block + ret) < block) { - ret = -EIO; - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if (buffer_new(&map_bh) && ((block + ret) > - (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits) - >> blkbits))) - nblocks = nblocks + ret; - } - - /* Update ctime if new blocks get allocated */ - if (nblocks) { - struct timespec now; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } + if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = (block + ret) << blkbits; + ext4_falloc_update_inode(inode, mode, new_size, + buffer_new(&map_bh)); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) break; } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; goto retry; - - /* - * Time to update the file size. - * Update only when preallocation was requested beyond the file size.
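The worked numbers behind the comment above: with blocksize = 4096 (blkbits = 12), offset = 3072 and len = 2048, the byte range [3072, 5120) touches blocks 0 and 1, so max_blocks must be 2, while the naive len >> blkbits would yield 0. A sketch of the block-aligned computation (sketch_block_align rounds up to a block boundary, as EXT4_BLOCK_ALIGN() is assumed to do):

#include <assert.h>
#include <stdint.h>

/* round size up to the next multiple of the block size */
static uint64_t sketch_block_align(uint64_t size, unsigned int blkbits)
{
        uint64_t mask = (1ULL << blkbits) - 1;

        return (size + mask) & ~mask;
}

int main(void)
{
        unsigned int blkbits = 12;              /* 4096-byte blocks */
        uint64_t offset = 3072, len = 2048;
        uint64_t block = offset >> blkbits;     /* first covered block */
        uint64_t max_blocks =
                (sketch_block_align(len + offset, blkbits) >> blkbits) - block;

        assert(block == 0 && max_blocks == 2);  /* blocks 0 and 1 */
        assert((len >> blkbits) == 0);          /* the naive answer */
        return 0;
}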
- */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len) > i_size_read(inode)) { - if (ret > 0) { - /* - * if no error, we assume preallocation succeeded - * completely - */ - i_size_write(inode, offset + len); - EXT4_I(inode)->i_disksize = i_size_read(inode); - } else if (ret < 0 && nblocks) { - /* Handle partial allocation scenario */ - loff_t newsize; - - newsize = (nblocks << blkbits) + i_size_read(inode); - i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); - EXT4_I(inode)->i_disksize = i_size_read(inode); - } } - mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac35ec58db5..4159be6366a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,8 +21,8 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -129,7 +129,7 @@ const struct file_operations ext4_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext4_file_write, - .ioctl = ext4_ioctl, + .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8d50879d1c2..1c8ba48d4f8 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,8 +27,8 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include "ext4.h" +#include "ext4_jbd2.h" /* * akpm: A new design for ext4_sync_file(). @@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 1555024e3b3..1d6329dbe39 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -11,8 +11,8 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/cryptohash.h> +#include "ext4.h" #define DELTA 0x9E3779B9 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 486e46a3918..c6efbab0c80 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -15,8 +15,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> @@ -25,7 +23,8 @@ #include <linux/bitops.h> #include <linux/blkdev.h> #include <asm/byteorder.h> - +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "group.h" @@ -75,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %lu\n", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -223,11 +222,9 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_inodes_count = cpu_to_le16( - le16_to_cpu(gdp->bg_free_inodes_count) + 1); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); if (is_directory) - gdp->bg_used_dirs_count = cpu_to_le16( - le16_to_cpu(gdp->bg_used_dirs_count) - 1); + le16_add_cpu(&gdp->bg_used_dirs_count, -1); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -588,7 +585,7 @@ got: ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %lu, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); @@ -664,11 +661,9 @@ got: cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); } - gdp->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); if (S_ISDIR(mode)) { - gdp->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + le16_add_cpu(&gdp->bg_used_dirs_count, 1); } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); @@ -744,23 +739,24 @@ got: if (err) goto fail_free_drop; - err = ext4_mark_inode_dirty(handle, inode); - if (err) { - ext4_std_error(sb, err); - goto fail_free_drop; - } if (test_opt(sb, EXTENTS)) { - /* set extent flag only for directory and file */ - if (S_ISDIR(mode) || S_ISREG(mode)) { + /* set extent flag only for directory, file and normal symlink */ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ext4_ext_tree_init(handle, inode); err = ext4_update_incompat_feature(handle, sb, EXT4_FEATURE_INCOMPAT_EXTENTS); if (err) - goto fail; + goto fail_free_drop; } } + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + ext4_debug("allocating inode %lu\n", inode->i_ino); goto really_out; fail: @@ -796,7 +792,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -805,7 +801,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); goto error; } @@ -830,7 +826,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "bad orphan inode %lu!
e2fsck was run?", ino); printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8fab233cb05..8d970774641 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -25,7 +25,6 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -36,6 +35,7 @@ #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -93,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "call ext4_journal_revoke"); err = ext4_journal_revoke(handle, blocknr, bh); if (err) - ext4_abort(inode->i_sb, __FUNCTION__, + ext4_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); BUFFER_TRACE(bh, "exit"); return err; @@ -985,6 +985,16 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, } else { retval = ext4_get_blocks_handle(handle, inode, block, max_blocks, bh, create, extend_disksize); + + if (retval > 0 && buffer_new(bh)) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + } } up_write((&EXT4_I(inode)->i_data_sem)); return retval; @@ -1230,7 +1240,7 @@ int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = jbd2_journal_dirty_data(handle, bh); if (err) - ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__, + ext4_journal_abort_handle(__func__, __func__, bh, handle, err); return err; } @@ -1301,10 +1311,11 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; } ret2 = ext4_journal_stop(handle); if (!ret) @@ -1329,10 +1340,11 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; ret2 = ext4_journal_stop(handle); if (!ret) @@ -2501,12 +2513,10 @@ out_stop: static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { - unsigned long desc, group_desc; ext4_group_t block_group; unsigned long offset; ext4_fsblk_t block; - struct buffer_head *bh; - struct ext4_group_desc * gdp; + struct ext4_group_desc *gdp; if (!ext4_valid_inum(sb, ino)) { /* @@ -2518,22 +2528,10 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - if (block_group >= EXT4_SB(sb)->s_groups_count) { - ext4_error(sb,"ext4_get_inode_block","group >= groups count"); + gdp = ext4_get_group_desc(sb, block_group, NULL); + if (!gdp) return 0; - } - smp_rmb(); - group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); - bh = EXT4_SB(sb)->s_group_desc[group_desc]; - if 
(!bh) { - ext4_error (sb, "ext4_get_inode_block", - "Descriptor not loaded"); - return 0; - } - gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data + - desc * EXT4_DESC_SIZE(sb)); /* * Figure out the offset within the block group inode table */ @@ -2976,7 +2974,8 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + /* clear the migrate flag in the raw_inode */ + raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -3374,7 +3373,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __FUNCTION__, + ext4_warning(inode->i_sb, __func__, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -3415,7 +3414,7 @@ void ext4_dirty_inode(struct inode *inode) current_handle->h_transaction != handle->h_transaction) { /* This task has a transaction open against a different fs */ printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); + __func__); } else { jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 25b13ede808..7a6c2f1faba 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -10,17 +10,17 @@ #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/capability.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> #include <linux/mount.h> #include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4.h" -int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -277,9 +277,6 @@ setversion_out: #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETFLAGS: @@ -319,9 +316,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ef97f19c2f9..fbec2ef9379 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,21 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/proc_fs.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> -#include <linux/version.h> -#include "group.h" - +#include "mballoc.h" /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -345,288 +331,6 @@ * */ -/* - * with 
AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* - */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) -#else -#define mb_debug(fmt, a...) -#endif - -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4/<dev>/mb_history - */ -#define EXT4_MB_HISTORY -#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) - -/* - * How long mballoc can look for a best extent (in found extents) - */ -#define MB_DEFAULT_MAX_TO_SCAN 200 - -/* - * How long mballoc must look for a best extent - */ -#define MB_DEFAULT_MIN_TO_SCAN 10 - -/* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* - * with 'ext4_mb_stats' allocator will collect stats that will be - * shown at umount. The collecting costs though! - */ -#define MB_DEFAULT_STATS 1 - -/* - * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served - * by the stream allocator, which purpose is to pack requests - * as close each to other as possible to produce smooth I/O traffic - * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req - */ -#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ - -/* - * for which requests use 2^N search using buddies - */ -#define MB_DEFAULT_ORDER2_REQS 2 - -/* - * default group prealloc size 512 blocks - */ -#define MB_DEFAULT_GROUP_PREALLOC 512 - -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; - -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 - -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; - struct list_head list; -}; - -struct ext4_group_info { - unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - -struct ext4_prealloc_space { - struct list_head pa_inode_list; - struct list_head pa_group_list; - union { - struct list_head pa_tmp_list; - struct rcu_head pa_rcu; - } u; - spinlock_t pa_lock; - atomic_t pa_count; - unsigned pa_deleted; - ext4_fsblk_t pa_pstart; /* phys. block */ - ext4_lblk_t pa_lstart; /* log. 
block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ -}; - - -struct ext4_free_extent { - ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; - ext4_group_t fe_group; - int fe_len; -}; - -/* - * Locality group: - * we try to group all related changes together - * so that writeback can flush/allocate them together as well - */ -struct ext4_locality_group { - /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ - spinlock_t lg_prealloc_lock; -}; - -struct ext4_allocation_context { - struct inode *ac_inode; - struct super_block *ac_sb; - - /* original request */ - struct ext4_free_extent ac_o_ex; - - /* goal request (after normalization) */ - struct ext4_free_extent ac_g_ex; - - /* the best found extent */ - struct ext4_free_extent ac_b_ex; - - /* copy of the bext found extent taken before preallocation efforts */ - struct ext4_free_extent ac_f_ex; - - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; - __u16 ac_groups_scanned; - __u16 ac_found; - __u16 ac_tail; - __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ - __u8 ac_status; - __u8 ac_criteria; - __u8 ac_repeats; - __u8 ac_2order; /* if request is to allocate 2^N blocks and - * N > 0, the field stores N, otherwise 0 */ - __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; - struct ext4_prealloc_space *ac_pa; - struct ext4_locality_group *ac_lg; -}; - -#define AC_STATUS_CONTINUE 1 -#define AC_STATUS_FOUND 2 -#define AC_STATUS_BREAK 3 - -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - -struct ext4_buddy { - struct page *bd_buddy_page; - void *bd_buddy; - struct page *bd_bitmap_page; - void *bd_bitmap; - struct ext4_group_info *bd_info; - struct super_block *bd_sb; - __u16 bd_blkbits; - ext4_group_t bd_group; -}; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) - -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); -#endif - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -static struct proc_dir_entry *proc_root_ext4; -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); - -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); -static void ext4_mb_return_to_preallocation(struct 
inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, - struct ext4_free_extent *fex) -{ - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; -} - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -736,7 +440,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_error(sb, __FUNCTION__, "double-free of inode" + ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); @@ -898,17 +602,17 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; - pa = list_entry(cur, struct ext4_prealloc_space, group_list); - ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); - for (i = 0; i < pa->len; i++) + for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ - __FILE__, __FUNCTION__, __LINE__) + __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif @@ -982,7 +686,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", group, free, grp->bb_free); /* @@ -1168,8 +872,9 @@ out: return err; } -static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; @@ -1367,7 +1072,7 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_error(sb, __FUNCTION__, "double-free of inode" + ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? 
inode->i_ino : 0, blocknr, block, e4b->bd_group); @@ -1848,7 +1553,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __FUNCTION__, "%d free blocks as per " + ext4_error(sb, __func__, "%d free blocks as per " "group info. But bitmap says 0\n", free); break; @@ -1857,7 +1562,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __FUNCTION__, "%d free blocks as per " + ext4_error(sb, __func__, "%d free blocks as per " "group info. But got %d blocks\n", free, ex.fe_len); /* @@ -1965,7 +1670,8 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t group; ext4_group_t i; @@ -2449,17 +2155,10 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_mb_proc != NULL) { - struct proc_dir_entry *p; - p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_history_fops; - p->data = sb; - } - p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_groups_fops; - p->data = sb; - } + proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, + &ext4_mb_seq_history_fops, sb); + proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, + &ext4_mb_seq_groups_fops, sb); } sbi->s_mb_history_max = 1000; @@ -2472,7 +2171,8 @@ static void ext4_mb_history_init(struct super_block *sb) /* if we can't allocate history, then we simple won't use it */ } -static void ext4_mb_store_history(struct ext4_allocation_context *ac) +static noinline_for_stack void +ext4_mb_store_history(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; @@ -2572,13 +2272,13 @@ static int ext4_mb_init_backend(struct super_block *sb) meta_group_info[j] = kzalloc(len, GFP_KERNEL); if (meta_group_info[j] == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - i--; goto err_freebuddy; } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); + i++; goto err_freebuddy; } memset(meta_group_info[j], 0, len); @@ -2618,13 +2318,11 @@ static int ext4_mb_init_backend(struct super_block *sb) return 0; err_freebuddy: - while (i >= 0) { + while (i-- > 0) kfree(ext4_get_group_info(sb, i)); - i--; - } i = num_meta_group_infos; err_freemeta: - while (--i >= 0) + while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: @@ -2808,7 +2506,8 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static void ext4_mb_free_committed_blocks(struct super_block *sb) +static noinline_for_stack void +ext4_mb_free_committed_blocks(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; @@ -2867,7 +2566,6 @@ static void ext4_mb_free_committed_blocks(struct super_block *sb) mb_debug("freed %u blocks in %u structures\n", count, count2); } -#define EXT4_ROOT "ext4" #define EXT4_MB_STATS_NAME "stats" #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" @@ -3007,9 +2705,9 @@ int __init init_ext4_mballoc(void) return -ENOMEM; } #ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir(EXT4_ROOT, 
proc_root_fs); + proc_root_ext4 = proc_mkdir("fs/ext4", NULL); if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); + printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); #endif return 0; } @@ -3020,7 +2718,7 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); #ifdef CONFIG_PROC_FS - remove_proc_entry(EXT4_ROOT, proc_root_fs); + remove_proc_entry("fs/ext4", NULL); #endif } @@ -3029,7 +2727,8 @@ void exit_ext4_mballoc(void) * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ -static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) { struct buffer_head *bitmap_bh = NULL; @@ -3078,7 +2777,7 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, in_range(block, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Allocating block in system zone - block = %llu", block); } @@ -3102,9 +2801,7 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_group, gdp)); } - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - - ac->ac_b_ex.fe_len); + le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); @@ -3138,7 +2835,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; else ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %lu blocks for locality group\n", + mb_debug("#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3146,15 +2843,16 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) * Normalization means making request better in terms of * size and alignment */ -static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { int bsbits, max; ext4_lblk_t end; - struct list_head *cur; loff_t size, orig_size, start_off; ext4_lblk_t start, orig_start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -3240,12 +2938,9 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { unsigned long pa_end; - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); - if (pa->pa_deleted) continue; spin_lock(&pa->pa_lock); @@ -3287,10 +2982,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { unsigned long pa_end; - pa = list_entry(cur, struct 
ext4_prealloc_space, pa_inode_list); spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + pa->pa_len; @@ -3382,7 +3075,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); + mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3412,12 +3105,12 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, /* * search goal blocks in preallocated space */ -static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; - struct list_head *cur; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3425,8 +3118,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* first, try per-file preallocation */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ @@ -3458,8 +3150,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return 0; rcu_read_lock(); - list_for_each_rcu(cur, &lg->lg_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { atomic_inc(&pa->pa_count); @@ -3579,7 +3270,8 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_prealloc_space *pa; @@ -3666,7 +3358,8 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) /* * creates new preallocated space for locality group inodes belongs to */ -static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; @@ -3739,11 +3432,11 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. 
* TODO: optimize the case when there are no in-core structures yet */ -static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, - struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long end; @@ -3759,8 +3452,6 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { ac->ac_sb = sb; ac->ac_inode = pa->pa_inode; @@ -3797,7 +3488,7 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", + ext4_error(sb, __func__, "free %u, pa_free %u\n", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3805,22 +3496,19 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, */ } atomic_add(free, &sbi->s_mb_discarded); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return err; } -static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; @@ -3838,7 +3526,6 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, ac->ac_b_ex.fe_len = pa->pa_len; ac->ac_b_ex.fe_logical = 0; ext4_mb_store_history(ac); - kmem_cache_free(ext4_ac_cachep, ac); } return 0; @@ -3853,12 +3540,14 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, * - how many do we discard * 1) how many requested */ -static int ext4_mb_discard_group_preallocations(struct super_block *sb, +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3886,6 +3575,7 @@ static int ext4_mb_discard_group_preallocations(struct super_block *sb, grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3940,9 +3630,9 @@ repeat: spin_unlock(pa->pa_obj_lock); if (pa->pa_linear) - ext4_mb_release_group_pa(&e4b, pa); + ext4_mb_release_group_pa(&e4b, pa, ac); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3950,6 +3640,8 @@ repeat: out: ext4_unlock_group(sb, group); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); ext4_mb_release_desc(&e4b); put_bh(bitmap_bh); return free; @@ -3970,6 +3662,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct 
ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3984,6 +3677,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -4048,7 +3742,7 @@ repeat: ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); ext4_unlock_group(sb, group); ext4_mb_release_desc(&e4b); @@ -4057,6 +3751,8 @@ repeat: list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4116,7 +3812,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "PA:%lu:%d:%u \n", i, start, pa->pa_len); } - ext4_lock_group(sb, i); + ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; @@ -4175,7 +3871,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; @@ -4406,7 +4103,8 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, ext4_mb_free_committed_blocks(sb); } -static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group, ext4_grpblk_t block, int count) { struct ext4_group_info *db = e4b->bd_info; @@ -4497,7 +4195,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || block + count > ext4_blocks_count(es)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Freeing blocks not in datazone - " "block = %lu, count = %lu", block, count); goto error_return; @@ -4538,7 +4236,7 @@ do_more: in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Freeing blocks in system zone - " "Block = %lu, count = %lu", block, count); } @@ -4596,8 +4294,7 @@ do_more: } spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + le16_add_cpu(&gdp->bg_free_blocks_count, count); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000000..bfe6add46bc --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,304 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/version.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "group.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. 
these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt, a...) printk(fmt, ##a) +#else +#define mb_debug(fmt, a...) +#endif + +/* + * with EXT4_MB_HISTORY mballoc stores last N allocations in memory + * and you can monitor it in /proc/fs/ext4/<dev>/mb_history + */ +#define EXT4_MB_HISTORY +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ +#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ +#define EXT4_MB_HISTORY_FREE 8 /* free */ + +#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ + EXT4_MB_HISTORY_PREALLOC) + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 1 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, whose purpose is to pack requests + * as close to each other as possible to produce smooth I/O traffic. + * We use locality group prealloc space for stream requests. + * We can tune the same via /proc/fs/ext4/<partition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; + +#ifdef EXT4_BB_MAX_BLOCKS +#undef EXT4_BB_MAX_BLOCKS +#endif +#define EXT4_BB_MAX_BLOCKS 30 + +struct ext4_free_metadata { + ext4_group_t group; + unsigned short num; + ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + struct list_head list; +}; + +struct ext4_group_info { + unsigned long bb_state; + unsigned long bb_tid; + struct ext4_free_metadata *bb_md_cur; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. 
block */ + unsigned short pa_len; /* len of preallocated chunk */ + unsigned short pa_free; /* how many blocks are free */ + unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + int fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + */ +struct ext4_locality_group { + /* for allocator */ + struct mutex lg_mutex; /* to serialize allocates */ + struct list_head lg_prealloc_list;/* list of preallocations */ + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (after normalization) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_mb_history { + struct ext4_free_extent orig; /* orig allocation */ + struct ext4_free_extent goal; /* goal allocation */ + struct ext4_free_extent result; /* result allocation */ + unsigned pid; + unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; + __u8 cr:3; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; +}; + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; +#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) +#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) + +#ifndef EXT4_MB_HISTORY +static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) +{ + return; +} +#else +static void ext4_mb_store_history(struct ext4_allocation_context *ac); +#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +static struct proc_dir_entry *proc_root_ext4; +struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); +static void ext4_mb_free_committed_blocks(struct super_block *); +static void ext4_mb_return_to_preallocation(struct inode *inode, + struct ext4_buddy *e4b, sector_t block, + int count); +static void ext4_mb_put_pa(struct 
ext4_allocation_context *, + struct super_block *, struct ext4_prealloc_space *pa); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); + + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + +static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + ext4_fsblk_t block; + + block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) + + fex->fe_start + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + return block; +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 5c1e27de775..b9e077ba07e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -13,8 +13,8 @@ */ #include <linux/module.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs_extents.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be @@ -327,7 +327,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, - struct inode *tmp_inode) + struct inode *tmp_inode) { int retval; __le32 i_data[3]; @@ -339,7 +339,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * i_data field of the original inode */ retval = ext4_journal_extend(handle, 1); - if (retval != 0) { + if (retval) { retval = ext4_journal_restart(handle, 1); if (retval) goto err_out; @@ -351,6 +351,18 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, down_write(&EXT4_I(inode)->i_data_sem); /* + * if EXT4_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ @@ -508,6 +520,17 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp, * switch the inode format to prevent read. */ mutex_lock(&(inode->i_mutex)); + /* + * Even though we take i_mutex we can still cause block allocation + * via mmap write to holes. If we have allocated new blocks we fail + * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. + * The flag is updated with i_data_sem held to prevent racing with + * block allocation. 
+ */ + down_read((&EXT4_I(inode)->i_data_sem)); + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + up_read((&EXT4_I(inode)->i_data_sem)); + handle = ext4_journal_start(inode, 1); ei = EXT4_I(inode); @@ -559,9 +582,15 @@ err_out: * tmp_inode */ free_ext_block(handle, tmp_inode); - else - retval = ext4_ext_swap_inode_data(handle, inode, - tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ if (ext4_journal_extend(handle, 1) != 0) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 28aa2ed4297..ab16beaa830 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -28,14 +28,14 @@ #include <linux/pagemap.h> #include <linux/jbd2.h> #include <linux/time.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "namei.h" #include "xattr.h" @@ -57,10 +57,15 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - if ((bh = ext4_bread(handle, inode, *block, 1, err))) { + bh = ext4_bread(handle, inode, *block, 1, err); + if (bh) { inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; - ext4_journal_get_write_access(handle,bh); + *err = ext4_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } } return bh; } @@ -348,7 +353,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); @@ -362,7 +367,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); @@ -371,7 +376,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); @@ -384,7 +389,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -396,7 +401,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -441,7 +446,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; 
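[Editor's note] The migrate.c hunks above implement a small flag-under-lock protocol: ext4_ext_migrate() raises EXT4_EXT_MIGRATE while holding i_data_sem, the block allocator clears the flag whenever it allocates new blocks (see the ext4_get_blocks_wrap() hunk in inode.c earlier in this diff), and ext4_ext_swap_inode_data() refuses to swap, returning -EAGAIN, if the flag has vanished, since that means an allocation raced the migration. The following is only a minimal user-space sketch of that pattern, not kernel code: all names are invented stand-ins, and a pthreads rwlock models i_data_sem.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins for the real ext4 types and flag bit. */
#define FAKE_EXT4_EXT_MIGRATE 0x00100000u

struct fake_inode {
	pthread_rwlock_t i_data_sem; /* models EXT4_I(inode)->i_data_sem */
	unsigned int i_flags;        /* models EXT4_I(inode)->i_flags */
};

/* migrate start: raise the flag while holding i_data_sem */
static void migrate_begin(struct fake_inode *inode)
{
	pthread_rwlock_wrlock(&inode->i_data_sem);
	inode->i_flags |= FAKE_EXT4_EXT_MIGRATE;
	pthread_rwlock_unlock(&inode->i_data_sem);
}

/* block allocator: new blocks will change i_data's format, so force
 * any pending migrate to fail by clearing the flag under the lock */
static void allocator_got_new_blocks(struct fake_inode *inode)
{
	pthread_rwlock_wrlock(&inode->i_data_sem);
	inode->i_flags &= ~FAKE_EXT4_EXT_MIGRATE;
	pthread_rwlock_unlock(&inode->i_data_sem);
}

/* swap step: if the flag is gone, an allocation raced the migration,
 * so bail out with -EAGAIN instead of swapping the inode data */
static int swap_inode_data(struct fake_inode *inode)
{
	int retval = 0;

	pthread_rwlock_wrlock(&inode->i_data_sem);
	if (!(inode->i_flags & FAKE_EXT4_EXT_MIGRATE))
		retval = -EAGAIN;
	else
		inode->i_flags &= ~FAKE_EXT4_EXT_MIGRATE;
	pthread_rwlock_unlock(&inode->i_data_sem);
	return retval;
}

int main(void)
{
	struct fake_inode inode = { .i_flags = 0 };

	pthread_rwlock_init(&inode.i_data_sem, NULL);
	migrate_begin(&inode);
	allocator_got_new_blocks(&inode); /* simulate a racing mmap write */
	printf("swap_inode_data: %d (expected -EAGAIN = %d)\n",
	       swap_inode_data(&inode), -EAGAIN);
	pthread_rwlock_destroy(&inode.i_data_sem);
	return 0;
}

Compile with cc -pthread. The point of the design is that both the set and the clear happen under the same lock, so the swap step can trust whatever flag value it reads while it holds that lock.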
@@ -457,7 +462,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -914,7 +919,7 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __FUNCTION__, "reading directory #%lu " + ext4_error(sb, __func__, "reading directory #%lu " "offset %lu", dir->i_ino, (unsigned long)block); brelse(bh); @@ -1007,7 +1012,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1532,7 +1537,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Directory index full!"); err = -ENOSPC; goto cleanup; @@ -1860,11 +1865,11 @@ static int empty_dir (struct inode * inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext4_warning(inode->i_sb, __FUNCTION__, + ext4_warning(inode->i_sb, __func__, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1893,7 +1898,7 @@ static int empty_dir (struct inode * inode) offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "error %d reading directory" " #%lu offset %lu", err, inode->i_ino, offset); @@ -2217,6 +2222,8 @@ retry: goto out_stop; } } else { + /* clear the extent format for fast symlink */ + EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char*)&EXT4_I(inode)->i_data,symname,l); inode->i_size = l-1; @@ -2347,6 +2354,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, EXT4_FEATURE_INCOMPAT_FILETYPE)) new_de->file_type = old_de->file_type; new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = + ext4_current_time(new_dir); + ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); ext4_journal_dirty_metadata(handle, new_bh); brelse(new_bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e29efa0f9d6..9f086a6a472 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -11,11 +11,10 @@ #define EXT4FS_DEBUG -#include <linux/ext4_jbd2.h> - #include <linux/errno.h> #include <linux/slab.h> +#include "ext4_jbd2.h" #include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) @@ -50,63 +49,63 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Cannot add at group %u (only %lu groups)", input->group, sbi->s_groups_count); else if (offset != 0) - ext4_warning(sb, __FUNCTION__, "Last group not full"); + ext4_warning(sb, __func__, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 
input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __FUNCTION__, "Bad blocks count %u", + ext4_warning(sb, __func__, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap (%llu) in inode table (%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap (%llu) in inode table (%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap (%llu) in GDT table" " (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap (%llu) in GDT table" " (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode table (%llu-%llu) overlaps" "GDT table (%llu-%llu)", (unsigned long long)input->inode_table, @@ -368,7 +367,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, @@ -424,7 +423,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; @@ -448,7 +447,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "new group %u GDT block %llu not reserved", input->group, gdblock); err = -EINVAL; @@ -469,10 +468,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dindj; n_group_desc = 
kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_KERNEL); + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning (sb, __FUNCTION__, + ext4_warning(sb, __func__, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -502,8 +501,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, EXT4_SB(sb)->s_gdb_count++; kfree(o_group_desc); - es->s_reserved_gdt_blocks = - cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); return 0; @@ -553,7 +551,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, int res, i; int err; - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; @@ -571,7 +569,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "reserved block %llu" " not at offset %ld", blk, @@ -715,7 +713,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't update backup for group %lu (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; @@ -755,33 +753,33 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + ext4_warning(sb, __func__, "blocks_count overflow\n"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + ext4_warning(sb, __func__, "inodes_count overflow\n"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)){ - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(inode)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Error opening resize inode"); return PTR_ERR(inode); } @@ -810,7 +808,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) lock_super(sb); if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; @@ -877,8 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) */ ext4_blocks_count_set(es, ext4_blocks_count(es) + input->blocks_count); - es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + - EXT4_INODES_PER_GROUP(sb)); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); /* * We need to protect s_groups_count against other CPUs seeing @@ -977,13 +974,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 
8) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "CONFIG_LBD not enabled\n"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -992,7 +989,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); if (last == 0) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "need to use ext2online to resize further"); return -EPERM; } @@ -1000,7 +997,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow"); + ext4_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } @@ -1008,7 +1005,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "will only finish group (%llu" " blocks, %u new)", o_blocks_count + add, add); @@ -1016,7 +1013,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add -1); if (!bh) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't read last block, resize aborted"); return -ENOSPC; } @@ -1028,13 +1025,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext4_warning(sb, __FUNCTION__, "error %d on journal start",err); + ext4_warning(sb, __func__, "error %d on journal start", err); goto exit_put; } lock_super(sb); if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); unlock_super(sb); ext4_journal_stop(handle); @@ -1044,7 +1041,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "error %d on journal write access", err); unlock_super(sb); ext4_journal_stop(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c81a8e759ba..52dd0679a4e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -21,8 +21,6 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> @@ -38,9 +36,10 @@ #include <linux/seq_file.h> #include <linux/log2.h> #include <linux/crc16.h> - #include <asm/uaccess.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "namei.h" @@ -135,7 +134,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * take the FS itself readonly cleanly. 
*/ journal = EXT4_SB(sb)->s_journal; if (is_journal_aborted(journal)) { - ext4_abort(sb, __FUNCTION__, + ext4_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } @@ -355,7 +354,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -945,8 +944,8 @@ static match_table_t tokens = { {Opt_mballoc, "mballoc"}, {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, - {Opt_err, NULL}, {Opt_resize, "resize"}, + {Opt_err, NULL}, }; static ext4_fsblk_t get_sb_block(void **data) @@ -1388,11 +1387,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, * a plain journaled filesystem we can keep it set as * valid forever! :) */ - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS); + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); #endif if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); - es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -1485,36 +1484,33 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Block bitmap for group %lu" - " not in group (block %llu)!", - i, block_bitmap); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Block bitmap for group %lu not in group " + "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Inode bitmap for group %lu" - " not in group (block %llu)!", - i, inode_bitmap); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode bitmap for group %lu not in group " + "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Inode table for group %lu" - " not in group (block %llu)!", - i, inode_table); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode table for group %lu not in group " + "(block %llu)!", i, inode_table); return 0; } if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - ext4_error(sb, __FUNCTION__, - "Checksum for group %lu failed (%u!=%u)\n", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Checksum for group %lu failed (%u!=%u)\n", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); return 0; } if (!flexbg_flag) @@ -1594,8 +1590,8 @@ static void ext4_orphan_cleanup (struct super_block * sb, while (es->s_last_orphan) { struct inode *inode; - if (!(inode = - ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } @@ -1605,7 +1601,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", - __FUNCTION__, 
inode->i_ino, inode->i_size); + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %Ld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); @@ -1613,7 +1609,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, } else { printk(KERN_DEBUG "%s: deleting unreferenced inode %lu\n", - __FUNCTION__, inode->i_ino); + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -2699,9 +2695,9 @@ static void ext4_clear_journal_err(struct super_block * sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __FUNCTION__, "Filesystem error recorded " + ext4_warning(sb, __func__, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __FUNCTION__, "Marking fs in need of " + ext4_warning(sb, __func__, "Marking fs in need of " "filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; @@ -2828,7 +2824,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) } if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) - ext4_abort(sb, __FUNCTION__, "Abort forced by user"); + ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); @@ -3040,8 +3036,14 @@ static int ext4_dquot_drop(struct inode *inode) /* We may delete quota structure so we need to reserve enough blocks */ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + /* + * We call dquot_drop() anyway to at least release references + * to quota structures so that umount does not hang. + */ + dquot_drop(inode); return PTR_ERR(handle); + } ret = dquot_drop(inode); err = ext4_journal_stop(handle); if (!ret) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e6f9da4287c..e9178643dc0 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -19,8 +19,8 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/namei.h> +#include "ext4.h" #include "xattr.h" static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e9054c1c7d9..3fbc2c6c3d0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,11 +53,11 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" @@ -92,6 +92,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); +static int ext4_xattr_list(struct inode *inode, char *buffer, + size_t buffer_size); static struct mb_cache *ext4_xattr_cache; @@ -225,7 +227,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __FUNCTION__, +bad_block: ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -367,7 +369,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if 
(ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -420,7 +422,7 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -int +static int ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) { int i_error, b_error; @@ -484,8 +486,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, get_bh(bh); ext4_forget(handle, 1, inode, bh, bh->b_blocknr); } else { - BHDR(bh)->h_refcount = cpu_to_le32( - le32_to_cpu(BHDR(bh)->h_refcount) - 1); + le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; @@ -660,7 +661,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -738,7 +739,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ce = NULL; } ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; @@ -750,7 +751,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } } else { /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_KERNEL); + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) @@ -789,8 +790,7 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); - BHDR(new_bh)->h_refcount = cpu_to_le32(1 + - le32_to_cpu(BHDR(new_bh)->h_refcount)); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); @@ -808,10 +808,8 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = le32_to_cpu( - EXT4_SB(sb)->s_es->s_first_data_block) + - (ext4_fsblk_t)EXT4_I(inode)->i_block_group * - EXT4_BLOCKS_PER_GROUP(sb); + ext4_fsblk_t goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); ext4_fsblk_t block = ext4_new_block(handle, inode, goal, &error); if (error) @@ -863,7 +861,7 @@ cleanup_dquot: goto cleanup; bad_block: - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; @@ -1166,7 +1164,7 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -1341,14 +1339,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: block %llu read error", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; @@ -1475,7 +1473,7 @@ 
again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index d7f5d6a1265..5992fe979bb 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -74,7 +74,6 @@ extern struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext4_xattr_list(struct inode *, char *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); @@ -99,12 +98,6 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static inline int -ext4_xattr_list(struct inode *inode, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static inline int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index f17eaf2321b..ca5f89fc6ca 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -6,9 +6,9 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/security.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" static size_t diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index e0f05acdafe..fff33382cad 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -9,8 +9,8 @@ #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #define XATTR_TRUSTED_PREFIX "trusted." diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 7ed3d8ebf09..67be723fcc4 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -8,8 +8,8 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #define XATTR_USER_PREFIX "user." 
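A note on a pattern that recurs in the hunks above (es->s_mnt_count, BHDR(bh)->h_refcount) and again further down (mdb->drWrCnt in hfs): open-coded endian read-modify-write sequences of the form x = cpu_to_le32(le32_to_cpu(x) + n) are collapsed into the le16_add_cpu()/le32_add_cpu()/be32_add_cpu() helpers. As a rough sketch of the semantics only — a portable userspace model, not the kernel implementation (the real helpers come from the byteorder headers and use arch-optimized byte swaps) — the little-endian 32-bit variant behaves like this:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Portable LE <-> native conversion via byte assembly; identity on
	 * little-endian machines, a swap on big-endian ones. */
	static uint32_t sketch_le32_to_cpu(uint32_t le)
	{
		uint8_t b[4];

		memcpy(b, &le, 4);
		return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
		       ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
	}

	static uint32_t sketch_cpu_to_le32(uint32_t v)
	{
		uint8_t b[4] = { v & 0xff, (v >> 8) & 0xff,
				 (v >> 16) & 0xff, (v >> 24) & 0xff };
		uint32_t le;

		memcpy(&le, b, 4);
		return le;
	}

	/* Model of le32_add_cpu(): add a native-endian value to a
	 * little-endian on-disk field in place (val may be negative,
	 * as in the h_refcount decrement above). */
	static void sketch_le32_add_cpu(uint32_t *var, int32_t val)
	{
		*var = sketch_cpu_to_le32(sketch_le32_to_cpu(*var) + (uint32_t)val);
	}

	int main(void)
	{
		uint32_t refcount = sketch_cpu_to_le32(41);

		sketch_le32_add_cpu(&refcount, 1);
		printf("%u\n", (unsigned)sketch_le32_to_cpu(refcount)); /* 42 */
		return 0;
	}

Besides being shorter, the helper states the intent (in-place arithmetic on a fixed-endian on-disk field) once, instead of repeating the field expression on both sides of the assignment.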
diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 639b3b4f86d..fda25479af2 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -242,7 +242,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_panic(sb, "%s: detected the cluster chain loop" - " (i_pos %lld)", __FUNCTION__, + " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; @@ -253,7 +253,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) goto out; else if (nr == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __FUNCTION__, + " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; @@ -286,7 +286,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return ret; else if (ret == FAT_ENT_EOF) { fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)", - __FUNCTION__, MSDOS_I(inode)->i_pos); + __func__, MSDOS_I(inode)->i_pos); return -EIO; } return dclus; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 13ab763cc51..302e95c4af7 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -546,7 +546,7 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } else if (cluster == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF", - __FUNCTION__); + __func__); err = -EIO; goto error; } diff --git a/fs/fat/file.c b/fs/fat/file.c index d604bb13242..27cc1164ec3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -208,7 +208,7 @@ static int fat_free(struct inode *inode, int skip) } else if (ret == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: invalid cluster chain (i_pos %lld)", - __FUNCTION__, MSDOS_I(inode)->i_pos); + __func__, MSDOS_I(inode)->i_pos); ret = -EIO; } else if (ret > 0) { err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5f522a55b59..4e0a3dd9d67 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1222,8 +1222,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, brelse(bh); goto out_invalid; } - logical_sector_size = - le16_to_cpu(get_unaligned((__le16 *)&b->sector_size)); + logical_sector_size = get_unaligned_le16(&b->sector_size); if (!is_power_of_2(logical_sector_size) || (logical_sector_size < 512) || (logical_sector_size > 4096)) { @@ -1322,8 +1321,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; - sbi->dir_entries = - le16_to_cpu(get_unaligned((__le16 *)&b->dir_entries)); + sbi->dir_entries = get_unaligned_le16(&b->dir_entries); if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) printk(KERN_ERR "FAT: bogus directory-entries per block" @@ -1335,7 +1333,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; - total_sectors = le16_to_cpu(get_unaligned((__le16 *)&b->sectors)); + total_sectors = get_unaligned_le16(&b->sectors); if (total_sectors == 0) total_sectors = le32_to_cpu(b->total_sect); diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index 2b46064f66b..50ab5eecb99 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -50,7 +50,11 @@ extern daddr_t vxfs_bmap1(struct inode *, long); /* vxfs_fshead.c */ extern int vxfs_read_fshead(struct super_block *); +/* vxfs_immed.c */
+extern const struct inode_operations vxfs_immed_symlink_iops; + /* vxfs_inode.c */ +extern const struct address_space_operations vxfs_immed_aops; extern struct kmem_cache *vxfs_inode_cachep; extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); extern struct inode * vxfs_get_fake_inode(struct super_block *, @@ -69,6 +73,7 @@ extern const struct file_operations vxfs_dir_operations; extern int vxfs_read_olt(struct super_block *, u_long); /* vxfs_subr.c */ +extern const struct address_space_operations vxfs_aops; extern struct page * vxfs_get_page(struct address_space *, u_long); extern void vxfs_put_page(struct page *); extern struct buffer_head * vxfs_bread(struct inode *, int); diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index 8a5959a61ba..c36aeaf92e4 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -35,6 +35,7 @@ #include <linux/namei.h> #include "vxfs.h" +#include "vxfs_extern.h" #include "vxfs_inode.h" diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ad88d2364bc..9f3f2ceb73f 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -41,11 +41,6 @@ #include "vxfs_extern.h" -extern const struct address_space_operations vxfs_aops; -extern const struct address_space_operations vxfs_immed_aops; - -extern const struct inode_operations vxfs_immed_symlink_iops; - struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 06557679ca4..ae45f77765c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -25,6 +25,45 @@ #include <linux/buffer_head.h> #include "internal.h" + +/** + * writeback_acquire - attempt to get exclusive writeback access to a device + * @bdi: the device's backing_dev_info structure + * + * It is a waste of resources to have more than one pdflush thread blocked on + * a single request queue. Exclusion at the request_queue level is obtained + * via a flag in the request_queue's backing_dev_info.state. + * + * Non-request_queue-backed address_spaces will share default_backing_dev_info, + * unless they implement their own. Which is somewhat inefficient, as this + * may prevent concurrent writeback against multiple devices. + */ +static int writeback_acquire(struct backing_dev_info *bdi) +{ + return !test_and_set_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @bdi: the device's backing_dev_info structure. + * + * Determine whether there is writeback in progress against a backing device. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_release - relinquish exclusive writeback access against a device. + * @bdi: the device's backing_dev_info structure + */ +static void writeback_release(struct backing_dev_info *bdi) +{ + BUG_ON(!writeback_in_progress(bdi)); + clear_bit(BDI_pdflush, &bdi->state); +} + /** * __mark_inode_dirty - internal function * @inode: inode to mark @@ -747,43 +786,4 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int return err; } - EXPORT_SYMBOL(generic_osync_inode); - -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. 
- * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. - */ -int writeback_acquire(struct backing_dev_info *bdi) -{ - return !test_and_set_bit(BDI_pdflush, &bdi->state); -} - -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback in progress against a backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_pdflush, &bdi->state); -} - -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -void writeback_release(struct backing_dev_info *bdi) -{ - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); -} diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 105d4a271e0..4f3cab32141 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -117,7 +117,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) parent = fuse_control_sb->s_root; inc_nlink(parent->d_inode); - sprintf(name, "%llu", (unsigned long long) fc->id); + sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, &simple_dir_operations); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index af639807524..87250b6a868 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void) return req; } +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); @@ -291,6 +299,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) + __releases(fc->lock) __acquires(fc->lock) { if (signal_pending(current)) return; @@ -307,8 +316,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) kill_fasync(&fc->fasync, SIGIO, POLL_IN); } -/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) + __releases(fc->lock) __acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -430,6 +439,17 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) } /* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + request_send_nowait_locked(fc, req); +} + +/* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. @@ -968,6 +988,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) * locked). 
*/ static void end_io_requests(struct fuse_conn *fc) + __releases(fc->lock) __acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c4807b3fc8a..2060bf06b90 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -132,7 +132,7 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, req->out.args[0].value = outarg; } -static u64 fuse_get_attr_version(struct fuse_conn *fc) +u64 fuse_get_attr_version(struct fuse_conn *fc) { u64 curr_version; @@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) } /* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +/* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request @@ -1122,6 +1166,8 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; + bool is_truncate = false; + loff_t oldsize; int err; if (!fuse_allow_task(fc, current)) @@ -1145,12 +1191,16 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, send_sig(SIGXFSZ, current, 0); return -EFBIG; } + is_truncate = true; } req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); + if (is_truncate) + fuse_set_nowrite(inode); + memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); iattr_to_fattr(attr, &inarg); @@ -1181,16 +1231,44 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); - return err; + goto error; } if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); - return -EIO; + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 
+ */ + if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (outarg.attr.size < oldsize) + fuse_truncate(inode->i_mapping, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); } - fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), 0); return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + return err; } static int fuse_setattr(struct dentry *entry, struct iattr *attr) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 676b0bc8a86..9ced35b0068 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,6 +210,49 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } +/* + * Check if page is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index == index) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file->f_path.dentry->d_inode; @@ -245,6 +288,21 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, int isdir) { @@ -261,6 +319,17 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. 
+ */ + err = write_inode_now(inode, 0); + if (err) + return err; + + fuse_sync_writes(inode); + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -294,7 +363,7 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync) void fuse_read_fill(struct fuse_req *req, struct file *file, struct inode *inode, loff_t pos, size_t count, int opcode) { - struct fuse_read_in *inarg = &req->misc.read_in; + struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; inarg->fh = ff->fh; @@ -320,7 +389,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, fuse_read_fill(req, file, inode, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read_in; + struct fuse_read_in *inarg = &req->misc.read.in; inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -329,31 +398,66 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, return req->out.args[0].size; } +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + static int fuse_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; int err; err = -EIO; if (is_bad_inode(inode)) goto out; + /* + * Page writeback can extend beyond the lifetime of the + * page-cache page, so make sure we read a properly synced + * page. + */ + fuse_wait_on_page_writeback(inode, page->index); + req = fuse_get_req(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; + attr_ver = fuse_get_attr_version(fc); + req->out.page_zeroing = 1; req->num_pages = 1; req->pages[0] = page; - fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE, - NULL); + num_read = fuse_send_read(req, file, inode, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_read_update_size(inode, pos + num_read, attr_ver); + SetPageUptodate(page); + } + fuse_invalidate_attr(inode); /* atime changed */ out: unlock_page(page); @@ -363,8 +467,19 @@ static int fuse_readpage(struct file *file, struct page *page) static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) { int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct inode *inode = req->pages[0]->mapping->host; + + /* + * Short read means EOF.
If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, req->misc.read.attr_ver); + } - fuse_invalidate_attr(req->pages[0]->mapping->host); /* atime changed */ + fuse_invalidate_attr(inode); /* atime changed */ for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -387,6 +502,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.page_zeroing = 1; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { struct fuse_file *ff = file->private_data; req->ff = fuse_file_get(ff); @@ -411,6 +527,8 @@ static int fuse_readpages_fill(void *_data, struct page *page) struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); + fuse_wait_on_page_writeback(inode, page->index); + if (req->num_pages && (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || @@ -477,11 +595,10 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } static void fuse_write_fill(struct fuse_req *req, struct file *file, - struct inode *inode, loff_t pos, size_t count, - int writepage) + struct fuse_file *ff, struct inode *inode, + loff_t pos, size_t count, int writepage) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_file *ff = file->private_data; struct fuse_write_in *inarg = &req->misc.write.in; struct fuse_write_out *outarg = &req->misc.write.out; @@ -490,7 +607,7 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file, inarg->offset = pos; inarg->size = count; inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0; - inarg->flags = file->f_flags; + inarg->flags = file ? file->f_flags : 0; req->in.h.opcode = FUSE_WRITE; req->in.h.nodeid = get_node_id(inode); req->in.argpages = 1; @@ -511,7 +628,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, fl_owner_t owner) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_write_fill(req, file, inode, pos, count, 0); + fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); if (owner != NULL) { struct fuse_write_in *inarg = &req->misc.write.in; inarg->write_flags |= FUSE_WRITE_LOCKOWNER; @@ -533,19 +650,36 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } +static void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + static int fuse_buffered_write(struct file *file, struct inode *inode, loff_t pos, unsigned count, struct page *page) { int err; size_t nres; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); unsigned offset = pos & (PAGE_CACHE_SIZE - 1); struct fuse_req *req; if (is_bad_inode(inode)) return -EIO; + /* + * Make sure writepages on the same page are not mixed up with + * plain writes. 
+ */ + fuse_wait_on_page_writeback(inode, page->index); + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -560,12 +694,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode, err = -EIO; if (!err) { pos += nres; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - + fuse_write_update_size(inode, pos); if (count == PAGE_CACHE_SIZE) SetPageUptodate(page); } @@ -588,6 +717,198 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, return res; } +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, inode, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = __grab_cache_page(mapping, index); + if (!page) + break; + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? 
count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count = 0; + ssize_t written = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct iov_iter i; + + WARN_ON(iocb->ki_pos != pos); + + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = remove_suid(file->f_path.dentry); + if (err) + goto out; + + file_update_time(file); + + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? 
written : err; +} + static void fuse_release_user_pages(struct fuse_req *req, int write) { unsigned i; @@ -645,14 +966,15 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, while (count) { size_t nres; - size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, nbytes, !write); + size_t nbytes_limit = min(count, nmax); + size_t nbytes; + int err = fuse_get_user_pages(req, buf, nbytes_limit, !write); if (err) { res = err; break; } nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(count, nbytes); + nbytes = min(nbytes_limit, nbytes); if (write) nres = fuse_send_write(req, file, inode, pos, nbytes, current->files); @@ -683,12 +1005,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, } fuse_put_request(fc, req); if (res > 0) { - if (write) { - spin_lock(&fc->lock); - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - } + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; } fuse_invalidate_attr(inode); @@ -716,21 +1034,225 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, return res; } -static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { - if ((vma->vm_flags & VM_SHARED)) { - if ((vma->vm_flags & VM_WRITE)) - return -ENODEV; - else - vma->vm_flags &= ~VM_MAYWRITE; + __free_page(req->pages[0]); + fuse_file_put(req->ff); + fuse_put_request(fc, req); +} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + list_del(&req->writepages_entry); + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + loff_t size = i_size_read(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + PAGE_CACHE_SIZE <= size) { + inarg->size = PAGE_CACHE_SIZE; + } else if (inarg->offset < size) { + inarg->size = size & (PAGE_CACHE_SIZE - 1); + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. 
+ * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct fuse_file *ff; + struct page *tmp_page; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(); + if (!req) + goto err; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + spin_lock(&fc->lock); + BUG_ON(list_empty(&fi->write_files)); + ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); + req->ff = fuse_file_get(ff); + spin_unlock(&fc->lock); + + fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); + + copy_highpage(tmp_page, page); + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_offset = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + end_page_writeback(page); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + return 0; + +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return -ENOMEM; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); } - return generic_file_mmap(file, vma); + return err; } -static int fuse_set_page_dirty(struct page *page) +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) { - printk("fuse_set_page_dirty: should not happen\n"); - dump_stack(); + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. 
+ * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + /* + * Don't use page->mapping as it may become NULL from a + * concurrent truncate. + */ + struct inode *inode = vma->vm_file->f_mapping->host; + + fuse_wait_on_page_writeback(inode, page->index); + return 0; +} + +static struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inode's write_files list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); + } + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; return 0; } @@ -909,12 +1431,37 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) return err ? 0 : outarg.block; } +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + struct inode *inode = file->f_path.dentry->d_inode; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += i_size_read(inode); + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } + mutex_unlock(&inode->i_mutex); + return retval; +} + static const struct file_operations fuse_file_operations = { - .llseek = generic_file_llseek, + .llseek = fuse_file_llseek, .read = do_sync_read, .aio_read = fuse_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = fuse_file_aio_write, .mmap = fuse_file_mmap, .open = fuse_open, .flush = fuse_flush, @@ -926,7 +1473,7 @@ static const struct file_operations fuse_file_operations = { }; static const struct file_operations fuse_direct_io_file_operations = { - .llseek = generic_file_llseek, + .llseek = fuse_file_llseek, .read = fuse_direct_read, .write = fuse_direct_write, .open = fuse_open, @@ -940,10 +1487,12 @@ static const struct file_operations fuse_direct_io_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .writepage = fuse_writepage, + .launder_page = fuse_launder_page, .write_begin = fuse_write_begin, .write_end = fuse_write_end, .readpages = fuse_readpages, - .set_page_dirty = fuse_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 67aaf6ee38e..dadffa21a20 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/mutex.h> +#include <linux/rwsem.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -25,6 +26,9 @@ /** Congestion starts at 75% of maximum */ #define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) +/** Bias for
fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -73,6 +77,19 @@ struct fuse_inode { /** Files usable in writepage. Protected by fc->lock */ struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requests (pending or sent) */ + struct list_head writepages; }; /** FUSE specific file data */ @@ -222,7 +239,10 @@ struct fuse_req { } release; struct fuse_init_in init_in; struct fuse_init_out init_out; - struct fuse_read_in read_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; struct { struct fuse_write_in in; struct fuse_write_out out; @@ -242,6 +262,12 @@ struct fuse_req { /** File used in the request (or NULL) */ struct fuse_file *ff; + /** Inode used in the request or NULL */ + struct inode *inode; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); @@ -390,8 +416,8 @@ struct fuse_conn { /** Entry on the fuse_conn_list */ struct list_head entry; - /** Unique ID */ - u64 id; + /** Device ID from super block */ + dev_t dev; /** Dentries in the control filesystem */ struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; @@ -438,7 +464,7 @@ extern const struct file_operations fuse_dev_operations; /** * Get a filled in inode */ -struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); @@ -446,7 +472,7 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, * Send FORGET command */ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - unsigned long nodeid, u64 nlookup); + u64 nodeid, u64 nlookup); /** * Initialize READ or READDIR request @@ -504,6 +530,11 @@ void fuse_init_symlink(struct inode *inode); void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +void fuse_truncate(struct address_space *mapping, loff_t offset); + /** * Initialize the client device */ @@ -522,6 +553,8 @@ void fuse_ctl_cleanup(void); */ struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc_nofs(void); + /** * Free a request */ @@ -558,6 +591,8 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); */ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); + /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -600,3 +635,10 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); int fuse_update_attributes(struct inode *inode, struct kstat *stat, struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4df34da2284..79b61587383 100644 --- a/fs/fuse/inode.c +++
b/fs/fuse/inode.c @@ -59,7 +59,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; + fi->writectr = 0; INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); fi->forget_req = fuse_request_alloc(); if (!fi->forget_req) { kmem_cache_free(fuse_inode_cachep, inode); @@ -73,13 +77,14 @@ static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); if (fi->forget_req) fuse_request_free(fi->forget_req); kmem_cache_free(fuse_inode_cachep, inode); } void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - unsigned long nodeid, u64 nlookup) + u64 nodeid, u64 nlookup) { struct fuse_forget_in *inarg = &req->misc.forget_in; inarg->nlookup = nlookup; @@ -109,7 +114,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } -static void fuse_truncate(struct address_space *mapping, loff_t offset) +void fuse_truncate(struct address_space *mapping, loff_t offset) { /* See vmtruncate() */ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); @@ -117,19 +122,12 @@ static void fuse_truncate(struct address_space *mapping, loff_t offset) unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); } - -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - loff_t oldsize; - spin_lock(&fc->lock); - if (attr_version != 0 && fi->attr_version > attr_version) { - spin_unlock(&fc->lock); - return; - } fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; @@ -159,6 +157,22 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fi->orig_i_mode = inode->i_mode; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) inode->i_mode &= ~S_ISVTX; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t oldsize; + + spin_lock(&fc->lock); + if (attr_version != 0 && fi->attr_version > attr_version) { + spin_unlock(&fc->lock); + return; + } + + fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; i_size_write(inode, attr->size); @@ -193,7 +207,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { - unsigned long nodeid = *(unsigned long *) _nodeidp; + u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) return 1; else @@ -202,12 +216,12 @@ static int fuse_inode_eq(struct inode *inode, void *_nodeidp) static int fuse_inode_set(struct inode *inode, void *_nodeidp) { - unsigned long nodeid = *(unsigned long *) _nodeidp; + u64 nodeid = *(u64 *) _nodeidp; get_fuse_inode(inode)->nodeid = nodeid; return 0; } -struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) { @@ -447,7 +461,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static struct fuse_conn *new_conn(void) 
+static struct fuse_conn *new_conn(struct super_block *sb) { struct fuse_conn *fc; int err; @@ -468,19 +482,41 @@ static struct fuse_conn *new_conn(void) atomic_set(&fc->num_waiting, 0); fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; + /* fuse does its own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->dev = sb->s_dev; err = bdi_init(&fc->bdi); - if (err) { - kfree(fc); - fc = NULL; - goto out; - } + if (err) + goto error_kfree; + err = bdi_register_dev(&fc->bdi, fc->dev); + if (err) + goto error_bdi_destroy; + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi/<bdi>/max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); fc->reqctr = 0; fc->blocked = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } -out: return fc; + +error_bdi_destroy: + bdi_destroy(&fc->bdi); +error_kfree: + mutex_destroy(&fc->inst_mutex); + kfree(fc); + return NULL; } void fuse_conn_put(struct fuse_conn *fc) @@ -548,6 +584,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } fuse_put_request(fc, req); @@ -578,12 +615,6 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) request_send_background(fc, req); } -static u64 conn_id(void) -{ - static u64 ctr = 1; - return ctr++; -} - static int fuse_fill_super(struct super_block *sb, void *data, int silent) { struct fuse_conn *fc; @@ -621,14 +652,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - fc = new_conn(); + fc = new_conn(sb); if (!fc) return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; - fc->max_read = d.max_read; + fc->max_read = max_t(unsigned, 4096, d.max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; @@ -659,7 +690,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->private_data) goto err_unlock; - fc->id = conn_id(); err = fuse_ctl_add_conn(fc); if (err) goto err_unlock; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 8479da47049..a4ff271df9e 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -212,7 +212,7 @@ int gdlm_sysfs_init(void) { gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); if (!gdlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; } return 0; } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 509c5d60bd8..7f48576289c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -41,7 +41,7 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_withdraw(sdp, assertion) \ ((likely(assertion)) ?
0 : gfs2_assert_withdraw_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, @@ -49,28 +49,28 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_warn(sdp, assertion) \ ((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist(sdp) \ -gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_inode(ip) \ -gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_rgrpd(rgd) \ -gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -91,7 +91,7 @@ static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, } #define gfs2_meta_check(sdp, bh) \ -gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__) +gfs2_meta_check_i((sdp), (bh), __func__, __FILE__, __LINE__) int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -118,7 +118,7 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, } #define gfs2_metatype_check(sdp, bh, type) \ -gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__) +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, u16 format) @@ -134,14 +134,14 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); #define gfs2_io_error(sdp) \ -gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line); #define gfs2_io_error_bh(sdp, bh) \ -gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); extern struct kmem_cache *gfs2_glock_cachep; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 24cf6fc4302..f6621a78520 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -208,7 +208,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; - u16 off, len; + unsigned off; + u16 off16; + u16 len; u8 *data, byte, m; int i; @@ -235,7 +237,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off); + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); @@ -280,7 +283,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off); + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> 
PAGE_CACHE_SHIFT); data = kmap(*pagep); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index b4651e128d7..36ca2e1a4fa 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -215,7 +215,7 @@ int hfs_mdb_get(struct super_block *sb) attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; - mdb->drWrCnt = cpu_to_be32(be32_to_cpu(mdb->drWrCnt) + 1); + be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 32de44ed002..8cf67974adf 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -297,7 +297,8 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) return 0; } p = match_strdup(&args[0]); - hsb->nls_disk = load_nls(p); + if (p) + hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); kfree(p); @@ -311,7 +312,8 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) return 0; } p = match_strdup(&args[0]); - hsb->nls_io = load_nls(p); + if (p) + hsb->nls_io = load_nls(p); if (!hsb->nls_io) { printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index bb5433608a4..e49fcee1e29 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -184,7 +184,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; - u16 off, len; + unsigned off; + u16 off16; + u16 len; u8 *data, byte, m; int i; @@ -211,7 +213,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off); + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); @@ -256,7 +259,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off); + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); data = kmap(*pagep); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d72d0a8b25a..9e59537b43d 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -311,6 +311,10 @@ int hfsplus_delete_cat(u32, struct inode *, struct qstr *); int hfsplus_rename_cat(u32, struct inode *, struct qstr *, struct inode *, struct qstr *); +/* dir.c */ +extern const struct inode_operations hfsplus_dir_inode_operations; +extern const struct file_operations hfsplus_dir_operations; + /* extents.c */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); void hfsplus_ext_write_extent(struct inode *); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37744cf3706..d53b2af91c2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -278,9 +278,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -extern const struct inode_operations hfsplus_dir_inode_operations; -extern struct file_operations hfsplus_dir_operations; - static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .truncate = hfsplus_file_truncate, diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index dc64fac0083..9997cbf8beb 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -132,7 +132,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) return 0; 
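The two hfs/super.c hunks above and the hfsplus/options.c hunk that follows are instances of the same fix: match_strdup() allocates memory and can therefore return NULL, but its result used to be passed straight to load_nls(). A minimal user-space sketch of the guarded pattern, with strdup() standing in for match_strdup() and a stub loader standing in for load_nls() (the stand-ins and parse_charset() itself are illustrative, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct table { const char *name; };	/* stands in for struct nls_table */

/* Stub loader: succeeds only for a name it knows, like load_nls(). */
static struct table *load_table(const char *name)
{
	static struct table utf8 = { "utf8" };
	return name && strcmp(name, "utf8") == 0 ? &utf8 : NULL;
}

static struct table *parse_charset(const char *arg)
{
	struct table *tbl = NULL;
	char *p = strdup(arg);		/* may fail, like match_strdup() */

	if (p)				/* only call the loader with a valid string */
		tbl = load_table(p);
	if (!tbl) {
		fprintf(stderr, "unable to load codepage \"%s\"\n",
			p ? p : "(null)");	/* don't pass NULL to %s ourselves */
		free(p);		/* free(NULL) is a harmless no-op */
		return NULL;
	}
	free(p);
	return tbl;
}

The kernel hunks keep printing p in the failure message even when it may be NULL; printk() substitutes a placeholder for a NULL "%s" argument, so that path stays safe.
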
} p = match_strdup(&args[0]); - sbi->nls = load_nls(p); + if (p) + sbi->nls = load_nls(p); if (!sbi->nls) { printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b0f9ad362d1..ce97a54518d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -357,7 +357,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: write access to a jounaled filesystem is not supported, " + printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) */ vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); vhdr->modify_date = hfsp_now2mt(); - vhdr->write_count = cpu_to_be32(be32_to_cpu(vhdr->write_count) + 1); + be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 72cab78f050..175d08eacc8 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -47,7 +47,7 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) return 0; wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); - extent = be32_to_cpu(get_unaligned((__be32 *)(bufptr + HFSP_WRAPOFF_EMBEDEXT))); + extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); wd->embed_start = (extent >> 16) & 0xFFFF; wd->embed_count = extent & 0xFFFF; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9783723e8ff..aeabf80f81a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -45,7 +45,7 @@ static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; int sysctl_hugetlb_shm_group; diff --git a/fs/inode.c b/fs/inode.c index 27ee1af50d0..bf647813042 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,8 +495,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea struct inode * inode = NULL; repeat: - hlist_for_each (node, head) { - inode = hlist_entry(node, struct inode, i_hash); + hlist_for_each_entry(inode, node, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) @@ -520,8 +519,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head struct inode * inode = NULL; repeat: - hlist_for_each (node, head) { - inode = hlist_entry(node, struct inode, i_hash); + hlist_for_each_entry(inode, node, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 7b94a1e3c01..6676c06bb7c 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -598,7 +598,7 @@ asmlinkage long sys_inotify_init(void) } ih = inotify_init(&inotify_user_ops); - if (unlikely(IS_ERR(ih))) { + if (IS_ERR(ih)) { ret = PTR_ERR(ih); goto out_free_dev; } diff --git a/fs/ioctl.c b/fs/ioctl.c index f32fbde2175..7db32b3382d 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -28,8 +28,8 @@ * 
* Returns 0 on success, -errno on error. */ -long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +static long vfs_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = -ENOTTY; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 1ba407c64df..2f0dc5a1463 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -145,6 +145,14 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, } de = tmpde; } + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < de->name_len[0] + + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + inode->i_ino); + return -EIO; + } if (first_de) { isofs_normalize_block_and_offset(de, diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index d1bdf8adb35..ccbf72faf27 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -78,29 +78,29 @@ static inline int isonum_712(char *p) } static inline unsigned int isonum_721(char *p) { - return le16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_le16(p); } static inline unsigned int isonum_722(char *p) { - return be16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_be16(p); } static inline unsigned int isonum_723(char *p) { /* Ignore bigendian datum due to broken mastering programs */ - return le16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_le16(p); } static inline unsigned int isonum_731(char *p) { - return le32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_le32(p); } static inline unsigned int isonum_732(char *p) { - return be32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_be32(p); } static inline unsigned int isonum_733(char *p) { /* Ignore bigendian datum due to broken mastering programs */ - return le32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_le32(p); } extern int iso_date(char *, int); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 344b247bc29..8299889a835 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -111,6 +111,13 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, dlen = de->name_len[0]; dpnt = de->name; + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < dlen + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + dir->i_ino); + return 0; + } if (sbi->s_rock && ((i = get_rock_ridge_filename(de, tmpname, dir)))) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a8173081f83..e0139786f71 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -520,22 +520,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); /* - * First, drop modified flag: all accesses to the buffers - * will be tracked for a new trasaction only -bzzz - */ - spin_lock(&journal->j_list_lock); - if (commit_transaction->t_buffers) { - new_jh = jh = commit_transaction->t_buffers->b_tnext; - do { - J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || - new_jh->b_modified == 0); - new_jh->b_modified = 0; - new_jh = new_jh->b_tnext; - } while (new_jh != jh); - } - spin_unlock(&journal->j_list_lock); - - /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. 
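Both isofs hunks above add the same defensive check before trusting an on-disk directory record: the record's stored length must be able to hold the fixed header plus the name length it claims, otherwise a corrupted or crafted image could walk the parser past the end of the entry. A simplified sketch of the invariant (the struct is reduced for illustration; the real iso_directory_record has more fixed fields):

#include <stddef.h>
#include <stdint.h>

/* Reduced stand-in for struct iso_directory_record: fixed fields
 * followed by name_len bytes of name. */
struct dir_record {
	uint8_t length;		/* total record length as stored on disk */
	uint8_t name_len;
	char name[];
};

/* Mirrors the added check: reject any record whose declared length
 * cannot contain the header plus the claimed name. */
static int record_fits(const struct dir_record *de, size_t de_len)
{
	return de_len >= sizeof(struct dir_record) + de->name_len;
}

Note the two callers react differently: do_isofs_readdir() fails the whole read with -EIO, while isofs_find_entry() just treats the entry as absent; both log the block and inode so the corruption can be located.
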
*/ @@ -584,6 +568,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; stats.u.run.rs_blocks_logged = 0; + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 954cff001df..53632e3e845 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -534,7 +534,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", - __FUNCTION__, journal->j_commit_request, tid); + __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif @@ -599,7 +599,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", - __FUNCTION__, + __func__, blocknr, bdevname(journal->j_dev, b)); err = -EIO; @@ -904,19 +904,10 @@ static void jbd2_stats_proc_init(journal_t *journal) snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); if (journal->j_proc_entry) { - struct proc_dir_entry *p; - p = create_proc_entry("history", S_IRUGO, - journal->j_proc_entry); - if (p) { - p->proc_fops = &jbd2_seq_history_fops; - p->data = journal; - p = create_proc_entry("info", S_IRUGO, - journal->j_proc_entry); - if (p) { - p->proc_fops = &jbd2_seq_info_fops; - p->data = journal; - } - } + proc_create_data("history", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_history_fops, journal); + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_info_fops, journal); } } @@ -1006,13 +997,14 @@ fail: */ /** - * journal_t * jbd2_journal_init_dev() - creates an initialises a journal structure + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure * @bdev: Block device on which to create the journal * @fs_dev: Device which hold journalled filesystem for this journal. * @start: Block nr Start of journal. * @len: Length of the journal in blocks. * @blocksize: blocksize of journalling device - * @returns: a newly created journal_t * + * + * Returns: a newly created journal_t * * * jbd2_journal_init_dev creates a journal which maps a fixed contiguous * range of blocks on an arbitrary block device. 
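The jbd2_stats_proc_init() conversion above is one instance of a tree-wide cleanup that recurs throughout this series: create_proc_entry() publishes the /proc entry before ->proc_fops and ->data are assigned, so a concurrent open() can observe a half-initialised entry, and every caller needs an if-block to finish the setup. proc_create_data() takes both up front. A kernel-style before/after sketch (kernel-only APIs, not compilable stand-alone):

/* Before: the entry is live before fops and data are filled in. */
struct proc_dir_entry *p;
p = create_proc_entry("info", S_IRUGO, journal->j_proc_entry);
if (p) {
	p->proc_fops = &jbd2_seq_info_fops;
	p->data = journal;
}

/* After: registration and setup happen atomically in one call. */
proc_create_data("info", S_IRUGO, journal->j_proc_entry,
		 &jbd2_seq_info_fops, journal);

A NULL return from proc_create_data() also means nothing was published at all, so callers need no partial-teardown path.
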
@@ -1036,7 +1028,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); journal = NULL; goto out; @@ -1092,7 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -1101,7 +1093,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) /* If that failed, give up */ if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -1187,7 +1179,7 @@ int jbd2_journal_create(journal_t *journal) */ printk(KERN_EMERG "%s: creation of journal on external device!\n", - __FUNCTION__); + __func__); BUG(); } @@ -1985,9 +1977,10 @@ static int journal_init_jbd2_journal_head_cache(void) static void jbd2_journal_destroy_jbd2_journal_head_cache(void) { - J_ASSERT(jbd2_journal_head_cache != NULL); - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; + if (jbd2_journal_head_cache) { + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; + } } /* @@ -2006,7 +1999,7 @@ static struct journal_head *journal_alloc_journal_head(void) jbd_debug(1, "out of memory for journal_head\n"); if (time_after(jiffies, last_warning + 5*HZ)) { printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __FUNCTION__); + __func__); last_warning = jiffies; } while (!ret) { @@ -2143,13 +2136,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", - __FUNCTION__); + __func__); jbd2_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", - __FUNCTION__); + __func__); jbd2_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; @@ -2314,10 +2307,12 @@ static int __init journal_init(void) BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); ret = journal_init_caches(); - if (ret != 0) + if (ret == 0) { + jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); + } else { jbd2_journal_destroy_caches(); - jbd2_create_debugfs_entry(); - jbd2_create_jbd_stats_proc_entry(); + } return ret; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 2e1453a5e99..257ff262576 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -139,7 +139,7 @@ repeat: oom: if (!journal_oom_retry) return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); yield(); goto repeat; } @@ -167,138 +167,121 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } +void jbd2_journal_destroy_revoke_caches(void) +{ + if (jbd2_revoke_record_cache) { + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + } + if (jbd2_revoke_table_cache) { + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; + } +} + int __init jbd2_journal_init_revoke_caches(void) { + J_ASSERT(!jbd2_revoke_record_cache); + J_ASSERT(!jbd2_revoke_table_cache); + jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", sizeof(struct jbd2_revoke_record_s), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); 
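The cache-teardown changes above (jbd2_journal_destroy_jbd2_journal_head_cache() and jbd2_journal_destroy_revoke_caches()) follow one pattern: guard each kmem_cache_destroy() with a NULL check and reset the pointer, so the destroy function is idempotent and the init function's error path can simply call it instead of open-coding cleanup at every failure point. A user-space sketch of the shape, with malloc()/free() standing in for the slab-cache calls:

#include <stdlib.h>

static void *record_cache;	/* stands in for jbd2_revoke_record_cache */
static void *table_cache;	/* stands in for jbd2_revoke_table_cache */

/* Idempotent teardown: safe whichever subset of the caches exists.
 * (free(NULL) is a no-op; kmem_cache_destroy() of this era was not
 * NULL-safe, hence the explicit checks in the kernel hunk.) */
static void destroy_caches(void)
{
	free(record_cache);
	record_cache = NULL;
	free(table_cache);
	table_cache = NULL;
}

static int init_caches(void)
{
	record_cache = malloc(64);
	if (!record_cache)
		goto fail;
	table_cache = malloc(64);
	if (!table_cache)
		goto fail;
	return 0;
fail:
	destroy_caches();
	return -1;		/* the kernel version returns -ENOMEM */
}
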
if (!jbd2_revoke_record_cache) - return -ENOMEM; + goto record_cache_failure; jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", sizeof(struct jbd2_revoke_table_s), 0, SLAB_TEMPORARY, NULL); - if (!jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - return -ENOMEM; - } + if (!jbd2_revoke_table_cache) + goto table_cache_failure; return 0; +table_cache_failure: + jbd2_journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; } -void jbd2_journal_destroy_revoke_caches(void) +static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; -} - -/* Initialise the revoke table for a given journal to a given size. */ - -int jbd2_journal_init_revoke(journal_t *journal, int hash_size) -{ - int shift, tmp; + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; - J_ASSERT (journal->j_revoke_table[0] == NULL); + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - shift = 0; - tmp = hash_size; while((tmp >>= 1UL) != 0UL) shift++; - journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; - - /* Check that the hash_size is a power of two */ - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; - - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; } for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + INIT_LIST_HEAD(&table->hash_table[tmp]); - journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; +out: + return table; +} + +static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); } - journal->j_revoke = journal->j_revoke_table[1]; + kfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} - /* Check that the hash_size is a power of two */ +/* Initialise the revoke table for a given journal to a given size. 
*/ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { - struct jbd2_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b9b0b6f899b..d6e006e6780 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -618,6 +618,12 @@ repeat: goto done; /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -690,7 +696,7 @@ repeat: if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", - __FUNCTION__); + __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); @@ -829,9 +835,16 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) if (jh->b_transaction == NULL) { jh->b_transaction = transaction; + + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "set next transaction"); jh->b_next_transaction = transaction; } @@ -901,7 +914,7 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed 
data\n", - __FUNCTION__); + __func__); err = -ENOMEM; goto out; } @@ -1230,6 +1243,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int drop_reserve = 0; int err = 0; + int was_modified = 0; BUFFER_TRACE(bh, "entry"); @@ -1248,6 +1262,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + /* * The buffer's going from the transaction, we must drop * all references -bzzz @@ -1265,7 +1282,12 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - drop_reserve = 1; + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; /* * We are no longer going to journal this buffer. @@ -1305,7 +1327,13 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); jh->b_next_transaction = NULL; - drop_reserve = 1; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; } } @@ -1434,7 +1462,8 @@ int jbd2_journal_stop(handle_t *handle) return err; } -/**int jbd2_journal_force_commit() - force any uncommitted transactions +/** + * int jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * For synchronous operations: force any uncommitted transactions @@ -2077,7 +2106,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; __jbd2_journal_file_buffer(jh, jh->b_transaction, - was_dirty ? BJ_Metadata : BJ_Reserved); + jh->b_modified ? BJ_Metadata : BJ_Reserved); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h index 9645275023e..a113ecc3baf 100644 --- a/fs/jffs2/debug.h +++ b/fs/jffs2/debug.h @@ -82,28 +82,28 @@ do { \ printk(JFFS2_ERR_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_WARNING(fmt, ...) \ do { \ printk(JFFS2_WARN_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_NOTICE(fmt, ...) \ do { \ printk(JFFS2_NOTICE_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_DEBUG(fmt, ...) 
\ do { \ printk(JFFS2_DBG_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) /* diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index e48665984cb..574cb7532d6 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -82,7 +82,7 @@ static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_ static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) { /* must be called under down_write(xattr_sem) */ - D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version)); + D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version)); if (xd->xname) { c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len); kfree(xd->xname); @@ -1252,7 +1252,7 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); if (rc) { JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", - __FUNCTION__, rc, totlen); + __func__, rc, totlen); rc = rc ? rc : -EBADFD; goto out; } diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 887f5759e53..bf6ab19b86e 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -89,7 +89,7 @@ void jfs_proc_init(void) { int i; - if (!(base = proc_mkdir("jfs", proc_root_fs))) + if (!(base = proc_mkdir("fs/jfs", NULL))) return; base->owner = THIS_MODULE; @@ -109,7 +109,7 @@ void jfs_proc_clean(void) if (base) { for (i = 0; i < NPROCENT; i++) remove_proc_entry(Entries[i].name, base); - remove_proc_entry("jfs", proc_root_fs); + remove_proc_entry("fs/jfs", NULL); } } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 40b16f23e49..5df517b81f3 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -573,7 +573,7 @@ again: /* Ensure the resulting lock will get added to granted list */ fl->fl_flags |= FL_SLEEP; if (do_vfs_lock(fl) < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); up_read(&host->h_rwsem); fl->fl_flags = fl_flags; status = 0; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 4d81553d294..81aca859bfd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -752,7 +752,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) return; default: printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", - -error, __FUNCTION__); + -error, __func__); nlmsvc_insert_block(block, 10 * HZ); nlmsvc_release_block(block); return; diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 2d4358c59f6..05ff4f1d702 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -609,7 +609,7 @@ error_inode: if (corrupt < 0) { fat_fs_panic(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __FUNCTION__, sinfo.i_pos); + __func__, sinfo.i_pos); } goto out; } diff --git a/fs/namei.c b/fs/namei.c index e179f71bfcb..32fd9655485 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> +#include <linux/device_cgroup.h> #include <asm/namei.h> #include <asm/uaccess.h> @@ -281,6 +282,10 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (retval) return retval; + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + return security_inode_permission(inode, mask, nd); } @@ -2028,6 +2033,10 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if 
(!dir->i_op || !dir->i_op->mknod) return -EPERM; + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; + error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; diff --git a/fs/namespace.c b/fs/namespace.c index fe376805cf5..4fc302c2a0e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1176,17 +1176,6 @@ static int mount_is_safe(struct nameidata *nd) #endif } -static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry) -{ - while (1) { - if (d == dentry) - return 1; - if (d == NULL || d == d->d_parent) - return 0; - d = d->d_parent; - } -} - struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, int flag) { @@ -1203,7 +1192,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry)) + if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { @@ -2340,10 +2329,10 @@ void __init mnt_init(void) err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", - __FUNCTION__, err); + __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) - printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); + printk(KERN_WARNING "%s: kobj create error\n", __func__); init_rootfs(); init_mount_tree(); } diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index df6d60bdfcd..97645f11211 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -102,48 +102,47 @@ static inline void ncp_init_request_s(struct ncp_server *server, int subfunction } static inline char * - ncp_reply_data(struct ncp_server *server, int offset) +ncp_reply_data(struct ncp_server *server, int offset) { return &(server->packet[sizeof(struct ncp_reply_header) + offset]); } -static inline __u8 BVAL(void* data) +static inline u8 BVAL(void *data) { - return get_unaligned((__u8*)data); + return *(u8 *)data; } -static __u8 - ncp_reply_byte(struct ncp_server *server, int offset) +static u8 ncp_reply_byte(struct ncp_server *server, int offset) { - return get_unaligned((__u8 *) ncp_reply_data(server, offset)); + return *(u8 *)ncp_reply_data(server, offset); } -static inline __u16 WVAL_LH(void* data) +static inline u16 WVAL_LH(void *data) { - return le16_to_cpu(get_unaligned((__le16*)data)); + return get_unaligned_le16(data); } -static __u16 - ncp_reply_le16(struct ncp_server *server, int offset) +static u16 +ncp_reply_le16(struct ncp_server *server, int offset) { - return le16_to_cpu(get_unaligned((__le16 *) ncp_reply_data(server, offset))); + return get_unaligned_le16(ncp_reply_data(server, offset)); } -static __u16 - ncp_reply_be16(struct ncp_server *server, int offset) +static u16 +ncp_reply_be16(struct ncp_server *server, int offset) { - return be16_to_cpu(get_unaligned((__be16 *) ncp_reply_data(server, offset))); + return get_unaligned_be16(ncp_reply_data(server, offset)); } -static inline __u32 DVAL_LH(void* data) +static inline u32 DVAL_LH(void *data) { - return le32_to_cpu(get_unaligned((__le32*)data)); + return get_unaligned_le32(data); } static __le32 - ncp_reply_dword(struct ncp_server *server, int offset) +ncp_reply_dword(struct ncp_server *server, int offset) { - return get_unaligned((__le32 *) ncp_reply_data(server, offset)); + return get_unaligned((__le32 *)ncp_reply_data(server, offset)); } static inline __u32 ncp_reply_dword_lh(struct ncp_server* server, int offset) { @@ -1006,8 +1005,8 @@ ncp_read_bounce(struct ncp_server 
*server, const char *file_id, result = ncp_request2(server, 72, bounce, bufsize); ncp_unlock_server(server); if (!result) { - int len = be16_to_cpu(get_unaligned((__be16*)((char*)bounce + - sizeof(struct ncp_reply_header)))); + int len = get_unaligned_be16((char *)bounce + + sizeof(struct ncp_reply_header)); result = -EIO; if (len <= to_read) { char* source; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f2f3b284e6d..89ac5bb0401 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1321,6 +1321,7 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1341,6 +1342,7 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; /* @@ -1500,33 +1502,29 @@ int __init nfs_fs_proc_init(void) { struct proc_dir_entry *p; - proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); if (!proc_fs_nfs) goto error_0; proc_fs_nfs->owner = THIS_MODULE; /* a file of servers with which we're dealing */ - p = create_proc_entry("servers", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); if (!p) goto error_1; - p->proc_fops = &nfs_server_list_fops; - p->owner = THIS_MODULE; - /* a file of volumes that we have mounted */ - p = create_proc_entry("volumes", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); if (!p) goto error_2; - - p->proc_fops = &nfs_volume_list_fops; - p->owner = THIS_MODULE; return 0; error_2: remove_proc_entry("servers", proc_fs_nfs); error_1: - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1538,7 +1536,7 @@ void nfs_fs_proc_exit(void) { remove_proc_entry("volumes", proc_fs_nfs); remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fa220dc7460..7226a506f3c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1575,6 +1575,11 @@ static int nfs_compare_super(struct super_block *sb, void *data) return nfs_compare_mount_options(sb, server, mntflags); } +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + static int nfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { @@ -1617,6 +1622,10 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -1664,6 +1673,7 @@ static void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); + bdi_unregister(&server->backing_dev_info); kill_anon_super(s); nfs_free_server(server); } @@ -1708,6 +1718,10 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -1984,6 +1998,10 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (s->s_fs_info != server) { 
nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -2070,6 +2088,10 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -2149,6 +2171,10 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 562abf3380d..0b3ffa9840c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -104,7 +104,7 @@ xdr_writemem(__be32 *p, const void *ptr, int nbytes) } while (0) #define RESERVE_SPACE(nbytes) do { \ p = xdr_reserve_space(xdr, nbytes); \ - if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \ + if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \ BUG_ON(!p); \ } while (0) @@ -134,7 +134,7 @@ xdr_error: \ p = xdr_inline_decode(xdr, nbytes); \ if (!p) { \ dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ - __FUNCTION__, __LINE__); \ + __func__, __LINE__); \ return -EIO; \ } \ } while (0) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 42f3820ee8f..5ac00c4fee9 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -169,6 +169,7 @@ static const struct file_operations exports_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; /*----------------------------------------------------------------------------*/ @@ -801,10 +802,9 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = create_proc_entry("fs/nfs/exports", 0, NULL); + entry = proc_create("exports", 0, entry, &exports_operations); if (!entry) return -ENOMEM; - entry->proc_fops = &exports_operations; return 0; } #else /* CONFIG_PROC_FS */ diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 8ac37c33d12..5e6724c1afd 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -45,7 +45,7 @@ static void ntfs_debug(const char *f, ...); extern void __ntfs_debug (const char *file, int line, const char *function, const char *format, ...) __attribute__ ((format (printf, 4, 5))); #define ntfs_debug(f, a...) \ - __ntfs_debug(__FILE__, __LINE__, __FUNCTION__, f, ##a) + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) extern void ntfs_debug_dump_runlist(const runlist_element *rl); @@ -58,10 +58,10 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); extern void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -#define ntfs_warning(sb, f, a...) __ntfs_warning(__FUNCTION__, sb, f, ##a) +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) extern void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -#define ntfs_error(sb, f, a...) __ntfs_error(__FUNCTION__, sb, f, ##a) +#define ntfs_error(sb, f, a...) 
__ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 2ad5c8b104b..790defb847e 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1191,7 +1191,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, if (size) { page = ntfs_map_page(mftbmp_mapping, ofs >> PAGE_CACHE_SHIFT); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read mft " "bitmap, aborting."); return PTR_ERR(page); @@ -2118,7 +2118,7 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) } /* Read, map, and pin the page containing the mft record. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to map page containing mft record " "to format 0x%llx.", (long long)mft_no); return PTR_ERR(page); @@ -2519,7 +2519,7 @@ mft_rec_already_initialized: ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; /* Read, map, and pin the page containing the mft record. */ page = ntfs_map_page(vol->mft_ino->i_mapping, index); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to map page containing allocated " "mft record 0x%llx.", (long long)bit); err = PTR_ERR(page); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 61a000f8524..e48aba698b7 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -327,7 +327,7 @@ clear_fields: static struct backing_dev_info dlmfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static struct inode *dlmfs_get_root_inode(struct super_block *sb) diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index e7dd1d4e347..0fdda2e8a4c 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -41,12 +41,12 @@ #ifndef CONFIG_LDM_DEBUG #define ldm_debug(...) do {} while (0) #else -#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __FUNCTION__, f, ##a) +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) #endif -#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __FUNCTION__, f, ##a) -#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __FUNCTION__, f, ##a) -#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __FUNCTION__, f, ##a) +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) 
_ldm_printk (KERN_INFO, __func__, f, ##a) __attribute__ ((format (printf, 3, 4))) static void _ldm_printk (const char *level, const char *function, diff --git a/fs/proc/array.c b/fs/proc/array.c index 07d6c4853fe..c135cbdd912 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -425,12 +425,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = cstime = utime = stime = cputime_zero; cgtime = gtime = cputime_zero; - rcu_read_lock(); if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; if (sig->tty) { - tty_pgrp = pid_nr_ns(sig->tty->pgrp, ns); + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } @@ -469,7 +470,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unlock_task_sighand(task, &flags); } - rcu_read_unlock(); if (!whole || num_threads < 2) wchan = get_wchan(task); diff --git a/fs/proc/base.c b/fs/proc/base.c index c5e412a00b1..fcf02f2deeb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -195,12 +195,32 @@ static int proc_root_link(struct inode *inode, struct path *path) return result; } -#define MAY_PTRACE(task) \ - (task == current || \ - (task->parent == current && \ - (task->ptrace & PT_PTRACED) && \ - (task_is_stopped_or_traced(task)) && \ - security_ptrace(current,task) == 0)) +/* + * Return zero if current may access user memory in @task, -error if not. + */ +static int check_mem_permission(struct task_struct *task) +{ + /* + * A task can always look at itself, in case it chooses + * to use system calls instead of load instructions. + */ + if (task == current) + return 0; + + /* + * If current is actively ptrace'ing, and would also be + * permitted to freshly attach with ptrace now, permit it. + */ + if (task->parent == current && (task->ptrace & PT_PTRACED) && + task_is_stopped_or_traced(task) && + ptrace_may_attach(task)) + return 0; + + /* + * No one else is allowed. + */ + return -EPERM; +} struct mm_struct *mm_for_maps(struct task_struct *task) { @@ -722,7 +742,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, if (!task) goto out_no_task; - if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) + if (check_mem_permission(task)) goto out; ret = -ENOMEM; @@ -748,7 +768,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; retval = access_process_vm(task, src, page, this_len, 0); - if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) { + if (!retval || check_mem_permission(task)) { if (!ret) ret = -EIO; break; @@ -792,7 +812,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (!task) goto out_no_task; - if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) + if (check_mem_permission(task)) goto out; copied = -ENOMEM; @@ -1181,6 +1201,81 @@ static const struct file_operations proc_pid_sched_operations = { #endif +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call.
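check_mem_permission() above replaces the old MAY_PTRACE() macro. Besides moving the logic out of a multi-line #define, it switches the convention from a boolean "may I?" to "0 or -errno", which is what the mem_read()/mem_write() call sites actually want to propagate, and it folds the ptrace_may_attach() test into the helper so callers can no longer forget it. A toy sketch of the same shape (the struct and its fields are illustrative only, not kernel types):

#include <errno.h>
#include <stdbool.h>

struct task {
	bool is_current;	/* the task is inspecting itself */
	bool traced_by_caller;	/* caller is an attached, permitted tracer */
};

/* Returns 0 if access is allowed and -EPERM otherwise, so the result
 * can be handed straight back as a syscall error code. */
static int check_access(const struct task *t)
{
	if (t->is_current)
		return 0;
	if (t->traced_by_caller)
		return 0;
	return -EPERM;
}

The call sites shrink accordingly: "if (!MAY_PTRACE(task) || !ptrace_may_attach(task))" becomes "if (check_mem_permission(task))".
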
+ * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + +static int proc_exe_link(struct inode *inode, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index a36ad3c75cf..9d53b39a9cf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -69,12 +69,7 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes, count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); start = NULL; - if (dp->get_info) { - /* Handle old net routines */ - n = dp->get_info(page, &start, *ppos, count); - if (n < count) - eof = 1; - } else if (dp->read_proc) { + if (dp->read_proc) { /* * How to be a proc read function * ------------------------------ @@ -277,8 +272,11 @@ static int xlate_proc_name(const char *name, int len; int rtn = 0; + de = *ret; + if (!de) + de = &proc_root; + spin_lock(&proc_subdir_lock); - de = &proc_root; while (1) { next = strchr(cp, '/'); if (!next) @@ -385,20 +383,18 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, lock_kernel(); spin_lock(&proc_subdir_lock); - if (de) { - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + unsigned int ino; - ino = de->low_ino; - de_get(de); - spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); - goto out_unlock; - } + ino = de->low_ino; + de_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, ino, de); + goto out_unlock; } } spin_unlock(&proc_subdir_lock); @@ -410,7 +406,8 @@ out_unlock: d_add(dentry, inode); return NULL; } - de_put(de); + if (de) + de_put(de); return ERR_PTR(error); } @@ -440,10 +437,6 @@ int 
proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, lock_kernel(); ino = inode->i_ino; - if (!de) { - ret = -EINVAL; - goto out; - } i = filp->f_pos; switch (i) { case 0: @@ -582,7 +575,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, /* make sure name is valid */ if (!name || !strlen(name)) goto out; - if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) + if (xlate_proc_name(name, parent, &fn) != 0) goto out; /* At this point there must not be any '/' characters beyond *fn */ @@ -682,9 +675,10 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, return ent; } -struct proc_dir_entry *proc_create(const char *name, mode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops) +struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) { struct proc_dir_entry *pde; nlink_t nlink; @@ -705,6 +699,7 @@ struct proc_dir_entry *proc_create(const char *name, mode_t mode, if (!pde) goto out; pde->proc_fops = proc_fops; + pde->data = data; if (proc_register(parent, pde) < 0) goto out_free; return pde; @@ -734,55 +729,58 @@ void free_proc_entry(struct proc_dir_entry *de) void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry **p; - struct proc_dir_entry *de; + struct proc_dir_entry *de = NULL; const char *fn = name; int len; - if (!parent && xlate_proc_name(name, &parent, &fn) != 0) - goto out; + if (xlate_proc_name(name, &parent, &fn) != 0) + return; len = strlen(fn); spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (!proc_match(len, fn, *p)) - continue; - de = *p; - *p = de->next; - de->next = NULL; - - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. */ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - spin_unlock(&proc_subdir_lock); + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) + return; - wait_for_completion(de->pde_unload_completion); + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. 
*/ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; - spin_lock(&proc_subdir_lock); - goto continue_removing; - } spin_unlock(&de->pde_unload_lock); + wait_for_completion(de->pde_unload_completion); + + goto continue_removing; + } + spin_unlock(&de->pde_unload_lock); + continue_removing: - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; - WARN_ON(de->subdir); - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); - break; + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + if (de->subdir) { + printk(KERN_WARNING "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + WARN_ON(1); } - spin_unlock(&proc_subdir_lock); -out: - return; + if (atomic_dec_and_test(&de->count)) + free_proc_entry(de); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 82b3a1b5a70..6f4e8dc97da 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -25,8 +25,7 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) { - if (de) - atomic_inc(&de->count); + atomic_inc(&de->count); return de; } @@ -35,18 +34,16 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) */ void de_put(struct proc_dir_entry *de) { - if (de) { - lock_kernel(); - if (!atomic_read(&de->count)) { - printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); - return; - } - - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); + lock_kernel(); + if (!atomic_read(&de->count)) { + printk("de_put: entry %s already free!\n", de->name); unlock_kernel(); + return; } + + if (atomic_dec_and_test(&de->count)) + free_proc_entry(de); + unlock_kernel(); } /* @@ -392,7 +389,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, { struct inode * inode; - if (de != NULL && !try_module_get(de->owner)) + if (!try_module_get(de->owner)) goto out_mod; inode = iget_locked(sb, ino); @@ -402,30 +399,29 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; - if (de) { - if (de->mode) { - inode->i_mode = de->mode; - inode->i_uid = de->uid; - inode->i_gid = de->gid; - } - if (de->size) - inode->i_size = de->size; - if (de->nlink) - inode->i_nlink = de->nlink; - if (de->proc_iops) - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + inode->i_nlink = de->nlink; + if (de->proc_iops) + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else #endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; - } + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; } } unlock_new_inode(inode); @@ -433,8 +429,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, return inode; out_ino: - if (de != NULL) - module_put(de->owner); + module_put(de->owner); out_mod: return NULL; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index bc72f5c8c47..28cbca80590 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 
+11,7 @@ #include <linux/proc_fs.h> +extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); #else @@ -46,9 +47,6 @@ extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); extern int maps_protect; -extern void create_seq_entry(char *name, mode_t mode, - const struct file_operations *f); -extern int proc_exe_link(struct inode *, struct path *); extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 941e95114b5..79ecd281d2c 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -137,7 +137,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - create_seq_entry("maps", S_IRUGO, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); return 0; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 441a32f0e5f..74a323d2b85 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -179,6 +179,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "PageTables: %8lu kB\n" "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -210,6 +211,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, @@ -826,14 +828,6 @@ static struct file_operations proc_kpageflags_operations = { struct proc_dir_entry *proc_root_kcore; -void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) -{ - struct proc_dir_entry *entry; - entry = create_proc_entry(name, mode, NULL); - if (entry) - entry->proc_fops = f; -} - void __init proc_misc_init(void) { static struct { @@ -862,66 +856,52 @@ void __init proc_misc_init(void) /* And now for trickier ones */ #ifdef CONFIG_PRINTK - { - struct proc_dir_entry *entry; - entry = create_proc_entry("kmsg", S_IRUSR, &proc_root); - if (entry) - entry->proc_fops = &proc_kmsg_operations; - } + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif - create_seq_entry("locks", 0, &proc_locks_operations); - create_seq_entry("devices", 0, &proc_devinfo_operations); - create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); + proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK - create_seq_entry("partitions", 0, &proc_partitions_operations); + proc_create("partitions", 0, NULL, &proc_partitions_operations); #endif - create_seq_entry("stat", 0, &proc_stat_operations); - create_seq_entry("interrupts", 0, &proc_interrupts_operations); + proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); #ifdef CONFIG_SLABINFO - create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK - create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations); + proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif #endif #ifdef CONFIG_MMU 
proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); #endif - create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); - create_seq_entry("pagetypeinfo", S_IRUGO, &pagetypeinfo_file_ops); - create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); - create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #ifdef CONFIG_BLOCK - create_seq_entry("diskstats", 0, &proc_diskstats_operations); + proc_create("diskstats", 0, NULL, &proc_diskstats_operations); #endif #ifdef CONFIG_MODULES - create_seq_entry("modules", 0, &proc_modules_operations); + proc_create("modules", 0, NULL, &proc_modules_operations); #endif #ifdef CONFIG_SCHEDSTATS - create_seq_entry("schedstat", 0, &proc_schedstat_operations); + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); #endif #ifdef CONFIG_PROC_KCORE - proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); - if (proc_root_kcore) { - proc_root_kcore->proc_fops = &proc_kcore_operations; + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); + if (proc_root_kcore) proc_root_kcore->size = (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; - } #endif #ifdef CONFIG_PROC_PAGE_MONITOR - create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); - create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); #endif #ifdef CONFIG_PROC_VMCORE - proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); - if (proc_vmcore) - proc_vmcore->proc_fops = &proc_vmcore_operations; + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif #ifdef CONFIG_MAGIC_SYSRQ - { - struct proc_dir_entry *entry; - entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); - if (entry) - entry->proc_fops = &proc_sysrq_trigger_operations; - } + proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); #endif } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 614c34b6d1c..5acc001d49f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -165,8 +165,8 @@ out: return err; } -static ssize_t proc_sys_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) { struct dentry *dentry = filp->f_dentry; struct ctl_table_header *head; @@ -190,12 +190,12 @@ static ssize_t proc_sys_read(struct file *filp, char __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(table, MAY_READ)) + if (sysctl_perm(head->root, table, write ? 
MAY_WRITE : MAY_READ)) goto out; /* careful: calling conventions are nasty here */ res = count; - error = table->proc_handler(table, 0, filp, buf, &res, ppos); + error = table->proc_handler(table, write, filp, buf, &res, ppos); if (!error) error = res; out: @@ -204,44 +204,16 @@ out: return error; } -static ssize_t proc_sys_write(struct file *filp, const char __user *buf, +static ssize_t proc_sys_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = filp->f_dentry; - struct ctl_table_header *head; - struct ctl_table *table; - ssize_t error; - size_t res; - - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - /* Has the sysctl entry disappeared on us? */ - error = -ENOENT; - if (!table) - goto out; - - /* Has the sysctl entry been replaced by a directory? */ - error = -EISDIR; - if (!table->proc_handler) - goto out; - - /* - * At this point we know that the sysctl was not unregistered - * and won't be until we finish. - */ - error = -EPERM; - if (sysctl_perm(table, MAY_WRITE)) - goto out; - - /* careful: calling conventions are nasty here */ - res = count; - error = table->proc_handler(table, 1, filp, (char __user *)buf, - &res, ppos); - if (!error) - error = res; -out: - sysctl_head_finish(head); + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} - return error; +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } @@ -416,7 +388,7 @@ static int proc_sys_permission(struct inode *inode, int mask, struct nameidata * goto out; /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(table, mask); + error = sysctl_perm(head->root, table, mask); out: sysctl_head_finish(head); return error; diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 49816e00b51..21f490f5d65 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -5,7 +5,7 @@ */ #include <asm/uaccess.h> - +#include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/time.h> @@ -136,39 +136,54 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; -/* - * This is the handler for /proc/tty/ldiscs - */ -static int tty_ldiscs_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { - int i; - int len = 0; - off_t begin = 0; + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) +{ +} + +static int tty_ldiscs_seq_show(struct seq_file *m, void *v) +{ + int i = *(loff_t *)v; struct tty_ldisc *ld; - for (i=0; i < NR_LDISCS; i++) { - ld = tty_ldisc_get(i); - if (ld == NULL) - continue; - len += sprintf(page+len, "%-10s %2d\n", - ld->name ? ld->name : "???", i); - tty_ldisc_put(i); - if (len+begin > off+count) - break; - if (len+begin < off) { - begin += len; - len = 0; - } - } - if (i >= NR_LDISCS) - *eof = 1; - if (off >= len+begin) + ld = tty_ldisc_get(i); + if (ld == NULL) return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + seq_printf(m, "%-10s %2d\n", ld->name ? 
ld->name : "???", i); + tty_ldisc_put(i); + return 0; +} + +static const struct seq_operations tty_ldiscs_seq_ops = { + .start = tty_ldiscs_seq_start, + .next = tty_ldiscs_seq_next, + .stop = tty_ldiscs_seq_stop, + .show = tty_ldiscs_seq_show, +}; + +static int proc_tty_ldiscs_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_ldiscs_seq_ops); } +static const struct file_operations tty_ldiscs_proc_fops = { + .owner = THIS_MODULE, + .open = proc_tty_ldiscs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -177,16 +192,14 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if ((!driver->read_proc && !driver->write_proc) || - !driver->driver_name || + if (!driver->ops->read_proc || !driver->driver_name || driver->proc_entry) return; ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); if (!ent) return; - ent->read_proc = driver->read_proc; - ent->write_proc = driver->write_proc; + ent->read_proc = driver->ops->read_proc; ent->owner = driver->owner; ent->data = driver; @@ -214,7 +227,6 @@ void proc_tty_unregister_driver(struct tty_driver *driver) */ void __init proc_tty_init(void) { - struct proc_dir_entry *entry; if (!proc_mkdir("tty", NULL)) return; proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); @@ -224,10 +236,7 @@ void __init proc_tty_init(void) * password lengths and inter-keystroke timings during password * entry. */ - proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR | S_IXUSR, NULL); - - create_proc_read_entry("tty/ldiscs", 0, NULL, tty_ldiscs_read_proc, NULL); - entry = create_proc_entry("tty/drivers", 0, NULL); - if (entry) - entry->proc_fops = &proc_tty_drivers_operations; + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); } diff --git a/fs/proc/root.c b/fs/proc/root.c index ef0fb57fc9e..95117538a4f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -22,8 +22,6 @@ #include "internal.h" -struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; - static int proc_test_super(struct super_block *sb, void *data) { return sb->s_fs_info == data; @@ -126,8 +124,8 @@ void __init proc_root_init(void) #ifdef CONFIG_SYSVIPC proc_mkdir("sysvipc", NULL); #endif - proc_root_fs = proc_mkdir("fs", NULL); - proc_root_driver = proc_mkdir("driver", NULL); + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ @@ -137,7 +135,7 @@ void __init proc_root_init(void) #ifdef CONFIG_PROC_DEVICETREE proc_device_tree_init(); #endif - proc_bus = proc_mkdir("bus", NULL); + proc_mkdir("bus", NULL); proc_sys_init(); } @@ -232,9 +230,5 @@ void pid_ns_release_proc(struct pid_namespace *ns) EXPORT_SYMBOL(proc_symlink); EXPORT_SYMBOL(proc_mkdir); EXPORT_SYMBOL(create_proc_entry); -EXPORT_SYMBOL(proc_create); +EXPORT_SYMBOL(proc_create_data); EXPORT_SYMBOL(remove_proc_entry); -EXPORT_SYMBOL(proc_root); -EXPORT_SYMBOL(proc_root_fs); -EXPORT_SYMBOL(proc_bus); -EXPORT_SYMBOL(proc_root_driver); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7415eeb7cc3..e2b8e769f51 100644 --- a/fs/proc/task_mmu.c +++ 
b/fs/proc/task_mmu.c @@ -75,40 +75,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return mm->total_vm; } -int proc_exe_link(struct inode *inode, struct path *path) -{ - struct vm_area_struct * vma; - int result = -ENOENT; - struct task_struct *task = get_proc_task(inode); - struct mm_struct * mm = NULL; - - if (task) { - mm = get_task_mm(task); - put_task_struct(task); - } - if (!mm) - goto out; - down_read(&mm->mmap_sem); - - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) - break; - vma = vma->vm_next; - } - - if (vma) { - *path = vma->vm_file->f_path; - path_get(&vma->vm_file->f_path); - result = 0; - } - - up_read(&mm->mmap_sem); - mmput(mm); -out: - return result; -} - static void pad_len_spaces(struct seq_file *m, int len) { len = 25 + sizeof(void*) * 6 - len; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8011528518b..4b733f10845 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -103,40 +103,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } -int proc_exe_link(struct inode *inode, struct path *path) -{ - struct vm_list_struct *vml; - struct vm_area_struct *vma; - struct task_struct *task = get_proc_task(inode); - struct mm_struct *mm = get_task_mm(task); - int result = -ENOENT; - - if (!mm) - goto out; - down_read(&mm->mmap_sem); - - vml = mm->context.vmlist; - vma = NULL; - while (vml) { - if ((vml->vma->vm_flags & VM_EXECUTABLE) && vml->vma->vm_file) { - vma = vml->vma; - break; - } - vml = vml->next; - } - - if (vma) { - *path = vma->vm_file->f_path; - path_get(&vma->vm_file->f_path); - result = 0; - } - - up_read(&mm->mmap_sem); - mmput(mm); -out: - return result; -} - /* * display mapping lines for a particular process's /proc/pid/maps */ diff --git a/fs/quota_v2.c b/fs/quota_v2.c index 23b647f25d0..234ada90363 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -306,7 +306,7 @@ static uint find_free_dqentry(struct dquot *dquot, int *err) printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); goto out_buf; } - dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); + le16_add_cpu(&dh->dqdh_entries, 1); memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); /* Find free structure in block */ for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); @@ -448,7 +448,7 @@ static int free_dqentry(struct dquot *dquot, uint blk) goto out_buf; } dh = (struct v2_disk_dqdbheader *)buf; - dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1); + le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
*/ if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b41a514b097..9590b902430 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -26,6 +26,9 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/ramfs.h> + +#include "internal.h" const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8428d5b2711..b13123424e4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -44,7 +44,7 @@ static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, }; diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index af7cc074a47..6b330639b51 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -11,5 +11,4 @@ extern const struct address_space_operations ramfs_aops; -extern const struct file_operations ramfs_file_operations; extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index da86042b3e0..e396b2fa474 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2574,11 +2574,9 @@ static int release_journal_dev(struct super_block *super, result = 0; - if (journal->j_dev_file != NULL) { - result = filp_close(journal->j_dev_file, NULL); - journal->j_dev_file = NULL; - journal->j_dev_bd = NULL; - } else if (journal->j_dev_bd != NULL) { + if (journal->j_dev_bd != NULL) { + if (journal->j_dev_bd->bd_dev != super->s_dev) + bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd); journal->j_dev_bd = NULL; } @@ -2603,7 +2601,6 @@ static int journal_init_dev(struct super_block *super, result = 0; journal->j_dev_bd = NULL; - journal->j_dev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? 
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2620,35 +2617,34 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) + } else if (jdev != super->s_dev) { + result = bd_claim(journal->j_dev_bd, journal); + if (result) { + blkdev_put(journal->j_dev_bd); + return result; + } + set_blocksize(journal->j_dev_bd, super->s_blocksize); + } + return 0; } - journal->j_dev_file = filp_open(jdev_name, 0, 0); - if (!IS_ERR(journal->j_dev_file)) { - struct inode *jdev_inode = journal->j_dev_file->f_mapping->host; - if (!S_ISBLK(jdev_inode->i_mode)) { - reiserfs_warning(super, "journal_init_dev: '%s' is " - "not a block device", jdev_name); - result = -ENOTBLK; - release_journal_dev(super, journal); - } else { - /* ok */ - journal->j_dev_bd = I_BDEV(jdev_inode); - set_blocksize(journal->j_dev_bd, super->s_blocksize); - reiserfs_info(super, - "journal_init_dev: journal device: %s\n", - bdevname(journal->j_dev_bd, b)); - } - } else { - result = PTR_ERR(journal->j_dev_file); - journal->j_dev_file = NULL; + journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; reiserfs_warning(super, "journal_init_dev: Cannot open '%s': %i", jdev_name, result); + return result; } - return result; + + set_blocksize(journal->j_dev_bd, super->s_blocksize); + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return 0; } /** diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 8f86c52b30d..b9dbeeca704 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -467,6 +467,7 @@ static const struct file_operations r_file_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static struct proc_dir_entry *proc_info_root = NULL; @@ -475,12 +476,8 @@ static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, struct super_block *)) { - struct proc_dir_entry *de; - de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir); - if (de) { - de->data = func; - de->proc_fops = &r_file_operations; - } + proc_create_data(name, 0, REISERFS_SB(sb)->procdir, + &r_file_operations, func); } int reiserfs_proc_info_init(struct super_block *sb) diff --git a/fs/select.c b/fs/select.c index 00f58c5c7e0..2c292146e24 100644 --- a/fs/select.c +++ b/fs/select.c @@ -425,7 +425,7 @@ sticky: return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -498,7 +498,7 @@ sticky: if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -528,7 +528,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize); } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ struct poll_list { struct poll_list *next; @@ -759,7 +759,7 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long
sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -805,7 +805,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) @@ -839,4 +839,4 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, return ret; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h index 734972b9269..fc4b1a5dd75 100644 --- a/fs/smbfs/smb_debug.h +++ b/fs/smbfs/smb_debug.h @@ -11,14 +11,14 @@ * these are normally enabled. */ #ifdef SMBFS_PARANOIA -# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __FUNCTION__ , ## a) +# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a) #else # define PARANOIA(f, a...) do { ; } while(0) #endif /* lots of debug messages */ #ifdef SMBFS_DEBUG_VERBOSE -# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __FUNCTION__ , ## a) +# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) #else # define VERBOSE(f, a...) do { ; } while(0) #endif @@ -28,7 +28,7 @@ * too common name. */ #ifdef SMBFS_DEBUG -#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __FUNCTION__ , ## a) +#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) #else #define DEBUG1(f, a...) do { ; } while(0) #endif diff --git a/fs/splice.c b/fs/splice.c index eeb1a86a701..633f58ebfb7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1075,7 +1075,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } diff --git a/fs/super.c b/fs/super.c index a5a4aca7e22..453877c5697 100644 --- a/fs/super.c +++ b/fs/super.c @@ -117,7 +117,7 @@ static inline void destroy_super(struct super_block *s) * Drop a superblock's refcount. Returns non-zero if the superblock was * destroyed. The caller must hold sb_lock.
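The TIF_RESTORE_SIGMASK to HAVE_SET_RESTORE_SIGMASK switch in fs/select.c lets each architecture decide how to mark a task for deferred sigmask restoration instead of hardcoding a thread flag. The calling idiom is unchanged; the following is condensed from the sys_ppoll hunk above, with the copy-in and error paths trimmed:

sigset_t newset, sigsaved;
s64 timeout = -1;
int ret;

/* Install the caller's mask for the duration of the wait. */
sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
sigprocmask(SIG_SETMASK, &newset, &sigsaved);

ret = do_sys_poll(ufds, nfds, &timeout);

if (ret == -EINTR) {
	/*
	 * A signal interrupted the wait.  The old mask may only be
	 * restored after the handler runs, so stash it and let the
	 * arch hook (formerly set_thread_flag(TIF_RESTORE_SIGMASK))
	 * arrange the deferred restore.
	 */
	memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
	set_restore_sigmask();
	ret = -ERESTARTNOHAND;
} else {
	/* Completed normally: put the old mask back immediately. */
	sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}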
*/ -int __put_super(struct super_block *sb) +static int __put_super(struct super_block *sb) { int ret = 0; diff --git a/fs/sync.c b/fs/sync.c index 7cd005ea763..228e17b5e9e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -64,7 +64,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) /* sync the superblock to buffers */ sb = inode->i_sb; lock_super(sb); - if (sb->s_op->write_super) + if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index dbdfabbfd60..e7735f643cd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -135,7 +135,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __FUNCTION__, count, *ppos, buffer->page); + __func__, count, *ppos, buffer->page); retval = simple_read_from_buffer(buf, count, ppos, buffer->page, buffer->count); out: diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index d9262f74f94..f8b82e73b3b 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -30,7 +30,7 @@ static const struct address_space_operations sysfs_aops = { static struct backing_dev_info sysfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 74168266cd5..14f0023984d 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -61,7 +61,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) /* instantiate and link root dentry */ root = d_alloc_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__FUNCTION__); + pr_debug("%s: could not get root dentry!\n",__func__); iput(inode); return -ENOMEM; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 42d51d1c05c..38ebe3f85b3 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -217,9 +217,9 @@ static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) if (sbi->s_bytesex == BYTESEX_PDP) *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); else if (sbi->s_bytesex == BYTESEX_LE) - *(__le32*)n = cpu_to_le32(le32_to_cpu(*(__le32*)n)+d); + le32_add_cpu((__le32 *)n, d); else - *(__be32*)n = cpu_to_be32(be32_to_cpu(*(__be32*)n)+d); + be32_add_cpu((__be32 *)n, d); return *n; } @@ -242,9 +242,9 @@ static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) { if (sbi->s_bytesex != BYTESEX_BE) - *(__le16*)n = cpu_to_le16(le16_to_cpu(*(__le16 *)n)+d); + le16_add_cpu((__le16 *)n, d); else - *(__be16*)n = cpu_to_be16(be16_to_cpu(*(__be16 *)n)+d); + be16_add_cpu((__be16 *)n, d); return *n; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 10c80b59ec4..5400524e9cb 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -20,6 +20,7 @@ #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> +#include <linux/syscalls.h> struct timerfd_ctx { struct hrtimer tmr; diff --git a/fs/udf/super.c b/fs/udf/super.c index b564fc140fe..9fb18a340fc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -240,7 +240,7 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __FUNCTION__, + udf_error(sb, __func__, "Unable to allocate space for %d partition maps", count); 
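The sysv fs32_add()/fs16_add() hunks use the byteorder helpers le16_add_cpu(), be16_add_cpu(), le32_add_cpu() and be32_add_cpu() (also seen earlier in quota_v2.c and affs), which add a CPU-order delta to an endian-fixed on-disk field in place. They are shorthand for the round trip they replace:

__le16 disk_entries;	/* a little-endian on-disk counter */

/* open-coded form being removed: */
disk_entries = cpu_to_le16(le16_to_cpu(disk_entries) + 1);

/* equivalent helper form: */
le16_add_cpu(&disk_entries, 1);

/* decrementing works the same way: */
le16_add_cpu(&disk_entries, -1);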
sbi->s_partitions = 0; @@ -1086,7 +1086,7 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ if (bitmap == NULL) { - udf_error(sb, __FUNCTION__, + udf_error(sb, __func__, "Unable to allocate space for bitmap " "and %d buffer_head pointers", nr_groups); return NULL; diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index 5b66162d074..a3522727ea5 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -986,7 +986,7 @@ error_inode: if (corrupt < 0) { fat_fs_panic(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __FUNCTION__, sinfo.i_pos); + __func__, sinfo.i_pos); } goto out; } diff --git a/fs/xattr.c b/fs/xattr.c index 89a942f07e1..4706a8b1f49 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -67,7 +67,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) } int -vfs_setxattr(struct dentry *dentry, char *name, void *value, +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; @@ -131,7 +131,7 @@ out_noalloc: EXPORT_SYMBOL_GPL(xattr_getsecurity); ssize_t -vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) +vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; @@ -187,7 +187,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -vfs_removexattr(struct dentry *dentry, char *name) +vfs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; int error; @@ -218,7 +218,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); * Extended attribute SET operations */ static long -setxattr(struct dentry *d, char __user *name, void __user *value, +setxattr(struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { int error; @@ -252,8 +252,8 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } asmlinkage long -sys_setxattr(char __user *path, char __user *name, void __user *value, - size_t size, int flags) +sys_setxattr(const char __user *path, const char __user *name, + const void __user *value, size_t size, int flags) { struct nameidata nd; int error; @@ -271,8 +271,8 @@ sys_setxattr(char __user *path, char __user *name, void __user *value, } asmlinkage long -sys_lsetxattr(char __user *path, char __user *name, void __user *value, - size_t size, int flags) +sys_lsetxattr(const char __user *path, const char __user *name, + const void __user *value, size_t size, int flags) { struct nameidata nd; int error; @@ -290,7 +290,7 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value, } asmlinkage long -sys_fsetxattr(int fd, char __user *name, void __user *value, +sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags) { struct file *f; @@ -315,7 +315,8 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, * Extended attribute GET operations */ static ssize_t -getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) +getxattr(struct dentry *d, const char __user *name, void __user *value, + size_t size) { ssize_t error; void *kvalue = NULL; @@ -349,8 +350,8 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) } asmlinkage ssize_t -sys_getxattr(char __user *path, char __user *name, void __user *value, - size_t size) +sys_getxattr(const char __user *path, const char __user *name, + void __user 
*value, size_t size) { struct nameidata nd; ssize_t error; @@ -364,7 +365,7 @@ sys_getxattr(char __user *path, char __user *name, void __user *value, } asmlinkage ssize_t -sys_lgetxattr(char __user *path, char __user *name, void __user *value, +sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, size_t size) { struct nameidata nd; @@ -379,7 +380,7 @@ sys_lgetxattr(char __user *path, char __user *name, void __user *value, } asmlinkage ssize_t -sys_fgetxattr(int fd, char __user *name, void __user *value, size_t size) +sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; ssize_t error = -EBADF; @@ -424,7 +425,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) } asmlinkage ssize_t -sys_listxattr(char __user *path, char __user *list, size_t size) +sys_listxattr(const char __user *path, char __user *list, size_t size) { struct nameidata nd; ssize_t error; @@ -438,7 +439,7 @@ sys_listxattr(char __user *path, char __user *list, size_t size) } asmlinkage ssize_t -sys_llistxattr(char __user *path, char __user *list, size_t size) +sys_llistxattr(const char __user *path, char __user *list, size_t size) { struct nameidata nd; ssize_t error; @@ -470,7 +471,7 @@ sys_flistxattr(int fd, char __user *list, size_t size) * Extended attribute REMOVE operations */ static long -removexattr(struct dentry *d, char __user *name) +removexattr(struct dentry *d, const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; @@ -485,7 +486,7 @@ removexattr(struct dentry *d, char __user *name) } asmlinkage long -sys_removexattr(char __user *path, char __user *name) +sys_removexattr(const char __user *path, const char __user *name) { struct nameidata nd; int error; @@ -503,7 +504,7 @@ sys_removexattr(char __user *path, char __user *name) } asmlinkage long -sys_lremovexattr(char __user *path, char __user *name) +sys_lremovexattr(const char __user *path, const char __user *name) { struct nameidata nd; int error; @@ -521,7 +522,7 @@ sys_lremovexattr(char __user *path, char __user *name) } asmlinkage long -sys_fremovexattr(int fd, char __user *name) +sys_fremovexattr(int fd, const char __user *name) { struct file *f; struct dentry *dentry; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 524021ff543..3f53dd101f9 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -64,3 +64,16 @@ config XFS_RT See the xfs man page in section 5 for additional information. If unsure, say N. + +config XFS_DEBUG + bool "XFS Debugging support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. 
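Constifying the vfs_*xattr() and sys_*xattr() prototypes above lets callers hand in string literals and read-only buffers without casts, and documents that the xattr layer never writes through the name or value pointers. A sketch of an in-kernel caller that benefits (the attribute name and value here are made up for illustration):

#include <linux/xattr.h>

static int example_tag_dentry(struct dentry *dentry)
{
	static const char value[] = "1";

	/*
	 * With the const-qualified prototype this compiles cleanly;
	 * before the change it needed a cast away from const.
	 */
	return vfs_setxattr(dentry, "user.example", value,
			    sizeof(value) - 1, 0);
}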
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index c110bb00266..ff6a19873e5 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -20,29 +20,24 @@ #include <linux/rwsem.h> -enum { MR_NONE, MR_ACCESS, MR_UPDATE }; - typedef struct { struct rw_semaphore mr_lock; +#ifdef DEBUG int mr_writer; +#endif } mrlock_t; +#ifdef DEBUG #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) -static inline void mraccess(mrlock_t *mrp) -{ - down_read(&mrp->mr_lock); -} - -static inline void mrupdate(mrlock_t *mrp) -{ - down_write(&mrp->mr_lock); - mrp->mr_writer = 1; -} - static inline void mraccess_nested(mrlock_t *mrp, int subclass) { down_read_nested(&mrp->mr_lock, subclass); @@ -51,10 +46,11 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); +#ifdef DEBUG mrp->mr_writer = 1; +#endif } - static inline int mrtryaccess(mrlock_t *mrp) { return down_read_trylock(&mrp->mr_lock); @@ -64,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; +#ifdef DEBUG mrp->mr_writer = 1; +#endif return 1; } -static inline void mrunlock(mrlock_t *mrp) +static inline void mrunlock_excl(mrlock_t *mrp) { - if (mrp->mr_writer) { - mrp->mr_writer = 0; - up_write(&mrp->mr_lock); - } else { - up_read(&mrp->mr_lock); - } +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); } -static inline void mrdemote(mrlock_t *mrp) +static inline void mrunlock_shared(mrlock_t *mrp) { - mrp->mr_writer = 0; - downgrade_write(&mrp->mr_lock); + up_read(&mrp->mr_lock); } -#ifdef DEBUG -/* - * Debug-only routine, without some platform-specific asm code, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * Note: means !ismrlocked would give false positives, so don't do that. 
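Because mr_writer is now maintained only under DEBUG, the old mrunlock(), which inspected mr_writer to decide between up_read() and up_write(), cannot work in production builds. The replacement makes the caller state the mode, and that mode must match the acquisition exactly: releasing an rwsem with the wrong flavour corrupts its count. Usage after the split:

mrlock_t lock;

mrinit(&lock, "example");

mraccess_nested(&lock, 0);	/* take shared */
/* ... read-side work ... */
mrunlock_shared(&lock);		/* must pair with the shared acquire */

mrupdate_nested(&lock, 0);	/* take exclusive */
/* ... write-side work ... */
mrunlock_excl(&lock);		/* must pair with the exclusive acquire */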
- */ -static inline int ismrlocked(mrlock_t *mrp, int type) +static inline void mrdemote(mrlock_t *mrp) { - if (mrp && type == MR_UPDATE) - return mrp->mr_writer; - return 1; -} +#ifdef DEBUG + mrp->mr_writer = 0; #endif + downgrade_write(&mrp->mr_lock); +} #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 52f6846101d..5105015a75a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -886,7 +886,7 @@ int xfs_buf_lock_value( xfs_buf_t *bp) { - return atomic_read(&bp->b_sema.count); + return bp->b_sema.count; } #endif diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 265f0168ab7..c672b3238b1 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -133,7 +133,7 @@ xfs_nfs_get_inode( if (!ip) return ERR_PTR(-EIO); - if (!ip->i_d.di_mode || ip->i_d.di_gen != generation) { + if (ip->i_d.di_gen != generation) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return ERR_PTR(-ENOENT); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 05905246434..65e78c13d4a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -43,9 +43,6 @@ #include <linux/smp_lock.h> static struct vm_operations_struct xfs_file_vm_ops; -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops; -#endif STATIC_INLINE ssize_t __xfs_file_read( @@ -202,22 +199,6 @@ xfs_file_fsync( (xfs_off_t)0, (xfs_off_t)-1); } -#ifdef CONFIG_XFS_DMAPI -STATIC int -xfs_vm_fault( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI); - if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) - return VM_FAULT_SIGBUS; - return filemap_fault(vma, vmf); -} -#endif /* CONFIG_XFS_DMAPI */ - /* * Unfortunately we can't just use the clean and simple readdir implementation * below, because nfs might call back into ->lookup from the filldir callback @@ -386,11 +367,6 @@ xfs_file_mmap( vma->vm_ops = &xfs_file_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; -#ifdef CONFIG_XFS_DMAPI - if (XFS_M(filp->f_path.dentry->d_inode->i_sb)->m_flags & XFS_MOUNT_DMAPI) - vma->vm_ops = &xfs_dmapi_file_vm_ops; -#endif /* CONFIG_XFS_DMAPI */ - file_accessed(filp); return 0; } @@ -437,47 +413,6 @@ xfs_file_ioctl_invis( return error; } -#ifdef CONFIG_XFS_DMAPI -#ifdef HAVE_VMOP_MPROTECT -STATIC int -xfs_vm_mprotect( - struct vm_area_struct *vma, - unsigned int newflags) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct xfs_mount *mp = XFS_M(inode->i_sb); - int error = 0; - - if (mp->m_flags & XFS_MOUNT_DMAPI) { - if ((vma->vm_flags & VM_MAYSHARE) && - (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) - error = XFS_SEND_MMAP(mp, vma, VM_WRITE); - } - return error; -} -#endif /* HAVE_VMOP_MPROTECT */ -#endif /* CONFIG_XFS_DMAPI */ - -#ifdef HAVE_FOP_OPEN_EXEC -/* If the user is attempting to execute a file that is offline then - * we have to trigger a DMAPI READ event before the file is marked as busy - * otherwise the invisible I/O will not be able to write to the file to bring - * it back online. 
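Dropping the di_mode == 0 test in xfs_nfs_get_inode() leaves the generation comparison as the sole staleness check, which is sufficient: inode numbers are recycled, but each reincarnation gets a new generation, so a file handle minted against an earlier life of the inode no longer matches. The core predicate, reduced to generic VFS terms (a sketch, not the XFS code itself):

#include <linux/fs.h>

/* True if the handle was issued for an earlier life of this inode. */
static int example_handle_is_stale(const struct inode *inode, u32 handle_gen)
{
	return inode->i_generation != handle_gen;
}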
- */ -STATIC int -xfs_file_open_exec( - struct inode *inode) -{ - struct xfs_mount *mp = XFS_M(inode->i_sb); - struct xfs_inode *ip = XFS_I(inode); - - if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) && - DM_EVENT_ENABLED(ip, DM_EVENT_READ)) - return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); - return 0; -} -#endif /* HAVE_FOP_OPEN_EXEC */ - /* * mmap()d file has taken write protection fault and is being made * writable. We can set the page state up correctly for a writable @@ -546,13 +481,3 @@ static struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, }; - -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops = { - .fault = xfs_vm_fault, - .page_mkwrite = xfs_vm_page_mkwrite, -#ifdef HAVE_VMOP_MPROTECT - .mprotect = xfs_vm_mprotect, -#endif -}; -#endif /* CONFIG_XFS_DMAPI */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4ddb86b73c6..a42ba9d7115 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -238,7 +238,7 @@ xfs_vget_fsop_handlereq( return error; if (ip == NULL) return XFS_ERROR(EIO); - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + if (ip->i_d.di_gen != igen) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return XFS_ERROR(ENOENT); } @@ -505,14 +505,14 @@ xfs_attrmulti_attr_get( { char *kbuf; int error = EFAULT; - + if (*len > XATTR_SIZE_MAX) return EINVAL; kbuf = kmalloc(*len, GFP_KERNEL); if (!kbuf) return ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags, NULL); + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) goto out_kfree; @@ -546,7 +546,7 @@ xfs_attrmulti_attr_set( if (copy_from_user(kbuf, ubuf, len)) goto out_kfree; - + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); out_kfree: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a1237dad643..2bf287ef548 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -511,7 +511,8 @@ xfs_vn_rename( xfs_dentry_to_name(&nname, ndentry); error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), - XFS_I(ndir), &nname); + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); if (likely(!error)) { if (new_inode) xfs_validate_fields(new_inode); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index e5143323e71..4edc46915b5 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -75,6 +75,7 @@ #include <linux/delay.h> #include <linux/log2.h> #include <linux/spinlock.h> +#include <linux/random.h> #include <asm/page.h> #include <asm/div64.h> @@ -99,7 +100,6 @@ /* * Feature macros (disable/enable) */ -#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 1ebd8004469..5e3b57516ec 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -394,7 +394,7 @@ xfs_zero_last_block( int error = 0; xfs_bmbt_irec_t imap; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { @@ -425,14 +425,14 @@ xfs_zero_last_block( * out sync. We need to drop the ilock while we do this so we * don't deadlock when the buffer cache calls back to us. 
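The asserts above migrate from ismrlocked() to xfs_isilocked(ip, flags), a single predicate covering both the iolock and the ilock. As the deleted mrlock.h comment noted, an rwsem can only prove exclusive ownership; shared holders are anonymous. A debug-only sketch of what such a predicate can honestly answer (example_isilocked() is illustrative and assumes the DEBUG-only mr_writer field from the mrlock.h hunk):

#ifdef DEBUG
static int example_isilocked(mrlock_t *mrp, int exclusive)
{
	if (exclusive)
		return mrp->mr_writer;	/* tracked on down_write paths */
	/*
	 * Shared state is invisible; the strongest honest claim is
	 * "somebody holds this rwsem".
	 */
	return rwsem_is_locked(&mrp->mr_lock);
}
#endif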
*/ - xfs_iunlock(ip, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; error = xfs_iozero(ip, isize, zero_len); - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } @@ -465,8 +465,7 @@ xfs_zero_eof( int error = 0; xfs_bmbt_irec_t imap; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* @@ -475,8 +474,7 @@ xfs_zero_eof( */ error = xfs_zero_last_block(ip, offset, isize); if (error) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } @@ -507,8 +505,7 @@ xfs_zero_eof( error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 0, NULL, 0, &imap, &nimaps, NULL, NULL); if (error) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } ASSERT(nimaps > 0); @@ -532,7 +529,7 @@ xfs_zero_eof( * Drop the inode lock while we're doing the I/O. * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -548,13 +545,13 @@ xfs_zero_eof( start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index e1d498b4ba7..e6be37dbd0e 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -50,7 +50,6 @@ struct xfs_iomap; #define XFS_INVAL_CACHED 18 #define XFS_DIORD_ENTER 19 #define XFS_DIOWR_ENTER 20 -#define XFS_SENDFILE_ENTER 21 #define XFS_WRITEPAGE_ENTER 22 #define XFS_RELEASEPAGE_ENTER 23 #define XFS_INVALIDPAGE_ENTER 24 diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 865eb708aa9..742b2c7852c 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1181,7 +1181,7 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 8b4d63ce869..9d73cb5c0fc 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -25,12 +25,6 @@ struct attrlist_cursor_kern; typedef struct inode bhv_vnode_t; -#define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) -#define VN_ISREG(vp) S_ISREG((vp)->i_mode) -#define VN_ISDIR(vp) S_ISDIR((vp)->i_mode) -#define VN_ISCHR(vp) S_ISCHR((vp)->i_mode) -#define VN_ISBLK(vp) S_ISBLK((vp)->i_mode) - /* * Vnode to Linux inode mapping. */ @@ -151,24 +145,6 @@ typedef struct bhv_vattr { XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) -/* - * Modes. 
- */ -#define VSUID S_ISUID /* set user id on execution */ -#define VSGID S_ISGID /* set group id on execution */ -#define VSVTX S_ISVTX /* save swapped text even after use */ -#define VREAD S_IRUSR /* read, write, execute permissions */ -#define VWRITE S_IWUSR -#define VEXEC S_IXUSR - -#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ - -/* - * Check whether mandatory file locking is enabled. - */ -#define MANDLOCK(vp, mode) \ - (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) - extern void vn_init(void); extern int vn_revalidate(bhv_vnode_t *); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 631ebb31b29..85df3288efd 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -933,7 +933,7 @@ xfs_qm_dqget( type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); if (ip) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (type == XFS_DQ_USER) ASSERT(ip->i_udquot == NULL); else @@ -1088,7 +1088,7 @@ xfs_qm_dqget( xfs_qm_mplist_unlock(mp); XFS_DQ_HASH_UNLOCK(h); dqret: - ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); xfs_dqtrace_entry(dqp, "DQGET DONE"); *O_dqpp = dqp; return (0); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 40ea5640956..d31cce1165c 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -670,7 +670,7 @@ xfs_qm_dqattach_one( xfs_dquot_t *dqp; int error; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; /* * See if we already have it in the inode itself. IO_idqpp is @@ -874,7 +874,7 @@ xfs_qm_dqattach( return 0; ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || - XFS_ISLOCKED_INODE_EXCL(ip)); + xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (! (flags & XFS_QMOPT_ILOCKED)) xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -888,7 +888,8 @@ xfs_qm_dqattach( goto done; nquotas++; } - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_OQUOTA_ON(mp)) { error = XFS_IS_GQUOTA_ON(mp) ? xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, @@ -913,7 +914,7 @@ xfs_qm_dqattach( * This WON'T, in general, result in a thrash. */ if (nquotas == 2) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_udquot); ASSERT(ip->i_gdquot); @@ -956,7 +957,7 @@ xfs_qm_dqattach( #ifdef QUOTADEBUG else - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); #endif return error; } @@ -1291,7 +1292,7 @@ xfs_qm_dqget_noattach( xfs_mount_t *mp; xfs_dquot_t *udqp, *gdqp; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); mp = ip->i_mount; udqp = NULL; gdqp = NULL; @@ -1392,7 +1393,7 @@ xfs_qm_qino_alloc( * Keep an extra reference to this quota inode. This inode is * locked exclusively and joined to the transaction already. */ - ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip)); + ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); VN_HOLD(XFS_ITOV((*ip))); /* @@ -1737,12 +1738,6 @@ xfs_qm_dqusage_adjust( return error; } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, XFS_ILOCK_EXCL); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * Obtain the locked dquots. In case of an error (eg. allocation * fails for ENOSPC), we return the negative of the error number @@ -2563,7 +2558,7 @@ xfs_qm_vop_chown( uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); /* old dquot */ @@ -2607,7 +2602,7 @@ xfs_qm_vop_chown_reserve( uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - ASSERT(XFS_ISLOCKED_INODE(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); mp = ip->i_mount; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -2717,7 +2712,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( if (!XFS_IS_QUOTA_ON(tp->t_mountp)) return; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); if (udqp) { diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 8342823dbdc..768a3b27d2b 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -1366,12 +1366,6 @@ xfs_qm_internalqcheck_adjust( return (error); } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, lock_flags); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * This inode can have blocks after eof which can get released * when we send it to inactive. Since we don't check the dquot diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index a8b85e2be9d..5e4a40b1c56 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -27,11 +27,6 @@ /* Number of dquots that fit in to a dquot block */ #define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) -#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE | MR_ACCESS) != 0) -#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE) != 0) - #define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) #define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index f441f836ca8..99611381e74 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -834,7 +834,7 @@ xfs_trans_reserve_quota_nblks( ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == XFS_TRANS_DQ_RES_RTBLKS || diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 855da040864..75845f95081 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -49,8 +49,6 @@ extern void assfail(char *expr, char *f, int l); #else /* DEBUG */ -#include <linux/random.h> - #define ASSERT(expr) \ (unlikely(expr) ? 
(void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 765aaf65e2d..540e4c98982 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -22,7 +22,7 @@ #define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 -#define QUOTADEBUG 1 +/* #define QUOTADEBUG 1 */ #endif #ifdef CONFIG_XFS_TRACE diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 8e130b9720a..ebee3a4f703 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -72,7 +72,7 @@ xfs_acl_vhasacl_default( { int error; - if (!VN_ISDIR(vp)) + if (!S_ISDIR(vp->i_mode)) return 0; xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error); return (error == 0); @@ -238,15 +238,8 @@ xfs_acl_vget( error = EINVAL; goto out; } - if (kind == _ACL_TYPE_ACCESS) { - bhv_vattr_t va; - - va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - if (error) - goto out; - xfs_acl_sync_mode(va.va_mode, xfs_acl); - } + if (kind == _ACL_TYPE_ACCESS) + xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); } out: @@ -341,14 +334,15 @@ xfs_acl_iaccess( { xfs_acl_t *acl; int rval; + struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE}; if (!(_ACL_ALLOC(acl))) return -1; /* If the file has no ACL return -1. */ rval = sizeof(xfs_acl_t); - if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE, - (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) { + if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, + ATTR_ROOT | ATTR_KERNACCESS)) { _ACL_FREE(acl); return -1; } @@ -373,23 +367,15 @@ xfs_acl_allow_set( bhv_vnode_t *vp, int kind) { - xfs_inode_t *ip = xfs_vtoi(vp); - bhv_vattr_t va; - int error; - if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) return EPERM; - if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp)) + if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode)) return ENOTDIR; if (vp->i_sb->s_flags & MS_RDONLY) return EROFS; - va.va_mask = XFS_AT_UID; - error = xfs_getattr(ip, &va, 0); - if (error) - return error; - if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) return EPERM; - return error; + return 0; } /* @@ -594,7 +580,7 @@ xfs_acl_get_attr( *error = xfs_attr_get(xfs_vtoi(vp), kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT, - (char *)aclp, &len, flags, sys_cred); + (char *)aclp, &len, flags); if (*error || (flags & ATTR_KERNOVAL)) return; xfs_acl_get_endian(aclp); @@ -643,7 +629,6 @@ xfs_acl_vtoacl( xfs_acl_t *access_acl, xfs_acl_t *default_acl) { - bhv_vattr_t va; int error = 0; if (access_acl) { @@ -652,16 +637,10 @@ xfs_acl_vtoacl( * be obtained for some reason, invalidate the access ACL. */ xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error); - if (!error) { - /* Got the ACL, need the mode... */ - va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - } - if (error) access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; else /* We have a good ACL and the file mode, synchronize. */ - xfs_acl_sync_mode(va.va_mode, access_acl); + xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); } if (default_acl) { @@ -719,7 +698,7 @@ xfs_acl_inherit( * If the new file is a directory, its default ACL is a copy of * the containing directory's default ACL. 
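The ACL hunks stop issuing a full xfs_getattr() round trip just to read the mode or owner: since bhv_vnode_t is the Linux inode and the XFS incore inode hangs off it, the fields are available directly (xfs_vtoi(vp)->i_d.di_mode, ->di_uid). Rendered in plain VFS terms, the streamlined permission check looks like this (a simplified sketch; XFS itself returns positive errnos here):

static int example_acl_allow_set(struct inode *inode, int set_default)
{
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		return -EPERM;
	if (set_default && !S_ISDIR(inode->i_mode))
		return -ENOTDIR;
	if (inode->i_sb->s_flags & MS_RDONLY)
		return -EROFS;
	/* Owner check reads the inode directly; no getattr call. */
	if (inode->i_uid != current->fsuid && !capable(CAP_FOWNER))
		return -EPERM;
	return 0;
}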
*/ - if (VN_ISDIR(vp)) + if (S_ISDIR(vp->i_mode)) xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); if (!error && !basicperms) xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); @@ -744,7 +723,7 @@ xfs_acl_setmode( bhv_vattr_t va; xfs_acl_entry_t *ap; xfs_acl_entry_t *gap = NULL; - int i, error, nomask = 1; + int i, nomask = 1; *basicperms = 1; @@ -756,11 +735,7 @@ xfs_acl_setmode( * mode. The m:: bits take precedence over the g:: bits. */ va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - if (error) - return error; - - va.va_mask = XFS_AT_MODE; + va.va_mode = xfs_vtoi(vp)->i_d.di_mode; va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ap = acl->acl_entry; for (i = 0; i < acl->acl_cnt; ++i) { diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 36d781ee5fc..df151a85918 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -101,14 +101,28 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); ktrace_t *xfs_attr_trace_buf; #endif +STATIC int +xfs_attr_name_to_xname( + struct xfs_name *xname, + const char *aname) +{ + if (!aname) + return EINVAL; + xname->name = aname; + xname->len = strlen(aname); + if (xname->len >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + return 0; +} /*======================================================================== * Overall external interface routines. *========================================================================*/ int -xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, - char *value, int *valuelenp, int flags, struct cred *cred) +xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, + char *value, int *valuelenp, int flags) { xfs_da_args_t args; int error; @@ -122,8 +136,8 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = *valuelenp; args.flags = flags; @@ -162,31 +176,29 @@ xfs_attr_get( const char *name, char *value, int *valuelenp, - int flags, - cred_t *cred) + int flags) { - int error, namelen; + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_get); - if (!name) - return(EINVAL); - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return(EFAULT); /* match IRIX behaviour */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return(EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred); + error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); return(error); } -int -xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, - char *value, int valuelen, int flags) +STATIC int +xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, + char *value, int valuelen, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -209,7 +221,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(namelen, valuelen); + XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) return(error); @@ -219,8 +231,8 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Fill in the arg structure for this request. 
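The new xfs_attr_name_to_xname() helper funnels every externally supplied attribute name through one validation point (NULL check, MAXNAMELEN limit) and produces a counted name, so the internal routines never re-derive the length with strlen(), and fixed names, like the ACL attribute built above as {SGI_ACL_FILE, SGI_ACL_FILE_SIZE}, can be constructed with no runtime scan at all. The shape of a counted name, with hypothetical identifiers:

struct example_name {
	const char	*name;	/* not guaranteed NUL-terminated inside */
	int		len;	/* length travels with the pointer */
};

/* Built statically for a compile-time-known name: */
static const struct example_name example_acl_name = {
	"example.acl", sizeof("example.acl") - 1
};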
*/ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = valuelen; args.flags = flags; @@ -236,7 +248,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Determine space new attribute will use, and if it would be * "local" or "remote" (note: local != inline). */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, + size = xfs_attr_leaf_newentsize(name->len, valuelen, mp->m_sb.sb_blocksize, &local); nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); @@ -429,26 +441,27 @@ xfs_attr_set( int valuelen, int flags) { - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_set); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + return xfs_attr_set_int(dp, &xname, value, valuelen, flags); } /* * Generic handler routine to remove a name from an attribute list. * Transitions attribute list from Btree to shortform as necessary. */ -int -xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) +STATIC int +xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -460,8 +473,8 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.flags = flags; args.hashval = xfs_da_hashname(args.name, args.namelen); args.dp = dp; @@ -575,17 +588,18 @@ xfs_attr_remove( const char *name, int flags) { - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(dp, XFS_ILOCK_SHARED); if (XFS_IFORK_Q(dp) == 0 || (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && @@ -595,10 +609,10 @@ xfs_attr_remove( } xfs_iunlock(dp, XFS_ILOCK_SHARED); - return xfs_attr_remove_int(dp, name, namelen, flags); + return xfs_attr_remove_int(dp, &xname, flags); } -int /* error */ +STATIC int xfs_attr_list_int(xfs_attr_list_context_t *context) { int error; @@ -2522,8 +2536,7 @@ attr_generic_get( { int error, asize = size; - error = xfs_attr_get(xfs_vtoi(vp), name, data, - &asize, xflags, NULL); + error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags); if (!error) return asize; return -error; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 786eba3121c..6cfc9384fe3 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -158,14 +158,10 @@ struct xfs_da_args; /* * Overall external interface routines. 
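With the conversion helper in place, the three external entry points (xfs_attr_get/set/remove) all follow one shape: stats bump, shutdown check, name validation, then the now-STATIC counted-name worker. A condensed rendering of that shape as it appears inside xfs_attr.c (example_attr_entry() is a stand-in for any of the three):

int example_attr_entry(xfs_inode_t *dp, const char *name, int flags)
{
	struct xfs_name xname;
	int error;

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return EIO;		/* positive errno, XFS convention */

	error = xfs_attr_name_to_xname(&xname, name);
	if (error)
		return error;		/* EINVAL or EFAULT */

	return xfs_attr_remove_int(dp, &xname, flags);
}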
*/ -int xfs_attr_set_int(struct xfs_inode *, const char *, int, char *, int, int); -int xfs_attr_remove_int(struct xfs_inode *, const char *, int, int); -int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_shortform_getvalue(struct xfs_da_args *); -int xfs_attr_fetch(struct xfs_inode *, const char *, int, - char *, int *, int, struct cred *); +int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); int xfs_attr_rmtval_get(struct xfs_da_args *args); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index eb198c01c35..53c259f5a5a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4074,7 +4074,6 @@ xfs_bmap_add_attrfork( error2: xfs_bmap_cancel(&flist); error1: - ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE)); xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3f53fad356a..5f3647cb988 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -162,7 +162,7 @@ xfs_swap_extents( ips[1] = ip; } - xfs_lock_inodes(ips, 2, 0, lock_flags); + xfs_lock_inodes(ips, 2, lock_flags); locked = 1; /* Verify that both files have the same format */ @@ -265,7 +265,7 @@ xfs_swap_extents( locked = 0; goto error0; } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); /* * Count the number of extended attribute blocks diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d3a0f538d6a..381ebda4f7b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -462,7 +462,7 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); spin_lock(&mp->m_sb_lock); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; @@ -524,7 +524,7 @@ xfs_reserve_blocks( */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_SB_LOCKED); + xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -552,11 +552,8 @@ retry: mp->m_resblks += free; mp->m_resblks_avail += free; fdblks_delta = -free; - mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); } else { fdblks_delta = -delta; - mp->m_sb.sb_fdblocks = - lcounter + XFS_ALLOC_SET_ASIDE(mp); mp->m_resblks = request; mp->m_resblks_avail += delta; } @@ -587,7 +584,6 @@ out: if (error == ENOSPC) goto retry; } - return 0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a64dfbd565a..aad8c5da38a 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -147,6 +147,7 @@ xfs_ialloc_ag_alloc( int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + unsigned int gen; args.tp = tp; args.mp = tp->t_mountp; @@ -290,6 +291,14 @@ xfs_ialloc_ag_alloc( else version = XFS_DINODE_VERSION_1; + /* + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + gen = random32(); for (j = 0; j < nbufs; j++) { /* * Get the block. 
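/*
 * The comment above explains the di_gen seeding: one random32() value is
 * chosen per newly allocated inode cluster so that a chunk that is freed
 * and immediately reallocated does not hand out sequential, easily
 * guessed generation numbers. A userspace sketch of the idea, with
 * rand() standing in for the kernel's random32():
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define INODES_PER_CLUSTER	64

struct dinode_sketch {
	uint32_t	di_gen;
};

/* One random seed per cluster, shared by every inode in it, as in the patch. */
static void seed_cluster(struct dinode_sketch *cluster, int n)
{
	uint32_t gen = (uint32_t)rand();	/* the kernel uses random32() */
	int i;

	for (i = 0; i < n; i++)
		cluster[i].di_gen = gen;
}

int main(void)
{
	struct dinode_sketch cluster[INODES_PER_CLUSTER];

	srand((unsigned)time(NULL));
	seed_cluster(cluster, INODES_PER_CLUSTER);
	printf("cluster generation seed: %u\n", (unsigned)cluster[0].di_gen);
	return 0;
}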
@@ -309,6 +318,7 @@ xfs_ialloc_ag_alloc( free = XFS_MAKE_IPTR(args.mp, fbuf, i); free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_core.di_version = version; + free->di_core.di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); xfs_ialloc_log_di(tp, fbuf, i, XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e657c512846..b07604b94d9 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -593,8 +593,9 @@ xfs_iunlock_map_shared( * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL */ void -xfs_ilock(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -607,16 +608,16 @@ xfs_ilock(xfs_inode_t *ip, (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) { + if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - } else if (lock_flags & XFS_IOLOCK_SHARED) { + else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - } - if (lock_flags & XFS_ILOCK_EXCL) { + + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - } else if (lock_flags & XFS_ILOCK_SHARED) { + else if (lock_flags & XFS_ILOCK_SHARED) mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - } + xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); } @@ -631,15 +632,12 @@ xfs_ilock(xfs_inode_t *ip, * lock_flags -- this parameter indicates the inode's locks to be * to be locked. See the comment for xfs_ilock() for a list * of valid values. - * */ int -xfs_ilock_nowait(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) { - int iolocked; - int ilocked; - /* * You can't set both SHARED and EXCL for the same lock, * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, @@ -651,37 +649,30 @@ xfs_ilock_nowait(xfs_inode_t *ip, (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - iolocked = 0; if (lock_flags & XFS_IOLOCK_EXCL) { - iolocked = mrtryupdate(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryupdate(&ip->i_iolock)) + goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - iolocked = mrtryaccess(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryaccess(&ip->i_iolock)) + goto out; } if (lock_flags & XFS_ILOCK_EXCL) { - ilocked = mrtryupdate(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; } else if (lock_flags & XFS_ILOCK_SHARED) { - ilocked = mrtryaccess(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; } xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; } /* @@ -697,8 +688,9 @@ xfs_ilock_nowait(xfs_inode_t *ip, * */ void -xfs_iunlock(xfs_inode_t *ip, - uint lock_flags) +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -713,31 +705,25 @@ xfs_iunlock(xfs_inode_t *ip, XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); - if (lock_flags & (XFS_IOLOCK_SHARED 
| XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) || - (ismrlocked(&ip->i_iolock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) || - (ismrlocked(&ip->i_iolock, MR_UPDATE))); - mrunlock(&ip->i_iolock); - } + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); - if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_ILOCK_SHARED) || - (ismrlocked(&ip->i_lock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_ILOCK_EXCL) || - (ismrlocked(&ip->i_lock, MR_UPDATE))); - mrunlock(&ip->i_lock); + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && + !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { /* * Let the AIL know that this item has been unlocked in case * it is in the AIL and anyone is waiting on it. Don't do * this if the caller has asked us not to. */ - if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) && - ip->i_itemp != NULL) { - xfs_trans_unlocked_item(ip->i_mount, - (xfs_log_item_t*)(ip->i_itemp)); - } + xfs_trans_unlocked_item(ip->i_mount, + (xfs_log_item_t*)(ip->i_itemp)); } xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); } @@ -747,21 +733,47 @@ xfs_iunlock(xfs_inode_t *ip, * if it is being demoted. */ void -xfs_ilock_demote(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) { ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - if (lock_flags & XFS_ILOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); - } - if (lock_flags & XFS_IOLOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); +} + +#ifdef DEBUG +/* + * Debug-only routine, without additional rw_semaphore APIs, we can + * now only answer requests regarding whether we hold the lock for write + * (reader state is outside our visibility, we only track writer state). + * + * Note: this means !xfs_isilocked would give false positives, so don't do that. 
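/*
 * The reworked xfs_ilock_nowait() drops the iolocked/ilocked bookkeeping
 * variables in favour of a single goto-based unwind: if the second
 * trylock fails, out_undo_iolock releases whichever iolock mode was
 * taken. A compact sketch of the pattern using POSIX rwlocks
 * (ilock_nowait() and struct inode_sketch are illustrative stand-ins):
 */
#include <pthread.h>
#include <stdio.h>

#define IOLOCK_EXCL	0x1
#define ILOCK_EXCL	0x2

struct inode_sketch {
	pthread_rwlock_t	iolock;
	pthread_rwlock_t	ilock;
};

static int ilock_nowait(struct inode_sketch *ip, unsigned int flags)
{
	/* pthread_rwlock_trywrlock() returns nonzero (EBUSY) on failure */
	if (flags & IOLOCK_EXCL) {
		if (pthread_rwlock_trywrlock(&ip->iolock))
			goto out;
	}
	if (flags & ILOCK_EXCL) {
		if (pthread_rwlock_trywrlock(&ip->ilock))
			goto out_undo_iolock;
	}
	return 1;			/* got everything */

out_undo_iolock:
	if (flags & IOLOCK_EXCL)
		pthread_rwlock_unlock(&ip->iolock);
out:
	return 0;			/* caller holds nothing */
}

int main(void)
{
	struct inode_sketch ip;

	pthread_rwlock_init(&ip.iolock, NULL);
	pthread_rwlock_init(&ip.ilock, NULL);
	printf("%d\n", ilock_nowait(&ip, IOLOCK_EXCL | ILOCK_EXCL)); /* 1 */
	return 0;
}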
+ */ +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == + XFS_ILOCK_EXCL) { + if (!ip->i_lock.mr_writer) + return 0; } + + if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == + XFS_IOLOCK_EXCL) { + if (!ip->i_iolock.mr_writer) + return 0; + } + + return 1; } +#endif /* * The following three routines simply manage the i_flock diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ca12acb9039..cf0bb9c1d62 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1291,7 +1291,7 @@ xfs_file_last_byte( xfs_fileoff_t size_last_block; int error; - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); mp = ip->i_mount; /* @@ -1402,7 +1402,7 @@ xfs_itruncate_start( bhv_vnode_t *vp; int error = 0; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT((flags == XFS_ITRUNC_DEFINITE) || (flags == XFS_ITRUNC_MAYBE)); @@ -1528,8 +1528,7 @@ xfs_itruncate_finish( xfs_bmap_free_t free_list; int error; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT(*tp != NULL); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1780,8 +1779,7 @@ xfs_igrow_start( xfs_fsize_t new_size, cred_t *credp) { - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(new_size > ip->i_size); /* @@ -1809,8 +1807,7 @@ xfs_igrow_finish( xfs_fsize_t new_size, int change_flag) { - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(ip->i_transp == tp); ASSERT(new_size > ip->i_size); @@ -2287,7 +2284,7 @@ xfs_ifree( xfs_dinode_t *dip; xfs_buf_t *ibp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_transp == tp); ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); @@ -2746,7 +2743,7 @@ void xfs_ipin( xfs_inode_t *ip) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); atomic_inc(&ip->i_pincount); } @@ -2779,7 +2776,7 @@ __xfs_iunpin_wait( { xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); if (atomic_read(&ip->i_pincount) == 0) return; @@ -2829,7 +2826,7 @@ xfs_iextents_copy( xfs_fsblock_t start_block; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); @@ -3132,7 +3129,7 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3297,7 +3294,7 @@ xfs_iflush_int( int first; #endif - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); 
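/*
 * As the comment above notes, a plain rw_semaphore exposes no
 * reader-side query, so xfs_isilocked() can only prove write ownership;
 * whenever a SHARED mode would satisfy the caller it must assume
 * success, which is exactly why negating the check gives false
 * positives. A sketch of that asymmetry with a hypothetical
 * writer-only tracking struct:
 */
#include <stdio.h>

#define LOCK_EXCL	0x1
#define LOCK_SHARED	0x2

struct mrlock_sketch {
	int	mr_writer;	/* only writer ownership is recorded */
};

static int isilocked(const struct mrlock_sketch *mr, unsigned int flags)
{
	/* Only a strictly-exclusive request can actually be verified. */
	if ((flags & (LOCK_EXCL | LOCK_SHARED)) == LOCK_EXCL)
		return mr->mr_writer;
	return 1;	/* shared state is invisible: assume it is held */
}

int main(void)
{
	struct mrlock_sketch mr = { .mr_writer = 0 };

	printf("%d\n", isilocked(&mr, LOCK_EXCL));		 /* 0 */
	printf("%d\n", isilocked(&mr, LOCK_EXCL | LOCK_SHARED)); /* 1 */
	return 0;
}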
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 93c37697a72..0a999fee4f0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -386,20 +386,9 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) #define XFS_IUNLOCK_NONOTIFY (1<<4) -/* #define XFS_IOLOCK_NESTED (1<<5) */ -#define XFS_EXTENT_TOKEN_RD (1<<6) -#define XFS_SIZE_TOKEN_RD (1<<7) -#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) -#define XFS_WILLLEND (1<<8) /* Always acquire tokens for lending */ -#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) -#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) -#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) -/* TODO:XFS_SIZE_TOKEN_WANT (1<<9) */ #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ - | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD \ - | XFS_WILLLEND) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) /* * Flags for lockdep annotations. @@ -483,6 +472,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); void xfs_iflock(xfs_inode_t *); int xfs_iflock_nowait(xfs_inode_t *); uint xfs_ilock_map_shared(xfs_inode_t *); @@ -534,7 +524,7 @@ int xfs_iflush(xfs_inode_t *, uint); void xfs_iflush_all(struct xfs_mount *); void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); -void xfs_lock_inodes(xfs_inode_t **, int, int, uint); +void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_synchronize_atime(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 93b5db453ea..167b33f1577 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -547,7 +547,7 @@ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); xfs_ipin(iip->ili_inode); } @@ -664,13 +664,13 @@ xfs_inode_item_unlock( ASSERT(iip != NULL); ASSERT(iip->ili_inode->i_itemp != NULL); - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_SHARED)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); /* * Clear the transaction pointer in the inode. 
*/ @@ -769,7 +769,7 @@ xfs_inode_item_pushbuf( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* * The ili_pushbuf_flag keeps others from @@ -857,7 +857,7 @@ xfs_inode_item_push( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); /* * Since we were able to lock the inode's flush lock and diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fb3cf119141..7edcde691d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -196,14 +196,14 @@ xfs_iomap( break; case BMAPI_WRITE: xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count); - lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; + lockmode = XFS_ILOCK_EXCL; if (flags & BMAPI_IGNSTATE) bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; xfs_ilock(ip, lockmode); break; case BMAPI_ALLOCATE: xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count); - lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; + lockmode = XFS_ILOCK_SHARED; bmapi_flags = XFS_BMAPI_ENTIRE; /* Attempt non-blocking lock */ @@ -523,8 +523,7 @@ xfs_iomap_write_direct( goto error_out; } - if (unlikely(!imap.br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) { + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) { error = xfs_cmn_err_fsblock_zero(ip, &imap); goto error_out; } @@ -624,7 +623,7 @@ xfs_iomap_write_delay( int prealloc, fsynced = 0; int error; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Make sure that the dquots are there. This doesn't hold @@ -686,8 +685,7 @@ retry: goto retry; } - if (unlikely(!imap[0].br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; @@ -838,9 +836,9 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - if (unlikely(!imap.br_startblock && - XFS_IS_REALTIME_INODE(ip))) + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap); + if ((offset_fsb >= imap.br_startoff) && (offset_fsb < (imap.br_startoff + imap.br_blockcount))) { @@ -934,8 +932,7 @@ xfs_iomap_write_unwritten( if (error) return XFS_ERROR(error); - if (unlikely(!imap.br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index eb85bdedad0..419de15aeb4 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -71,11 +71,6 @@ xfs_bulkstat_one_iget( ASSERT(ip != NULL); ASSERT(ip->i_blkno != (xfs_daddr_t)0); - if (ip->i_d.di_mode == 0) { - *stat = BULKSTAT_RV_NOTHING; - error = XFS_ERROR(ENOENT); - goto out_iput; - } vp = XFS_ITOV(ip); dic = &ip->i_d; @@ -124,7 +119,6 @@ xfs_bulkstat_one_iget( break; } - out_iput: xfs_iput(ip, XFS_ILOCK_SHARED); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2fec452afbc..da3988453b7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -54,8 +54,9 @@ STATIC void xfs_unmountfs_wait(xfs_mount_t *); #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC void xfs_icsb_sync_counters(xfs_mount_t *); + int); +STATIC void 
xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, int64_t, int); STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); @@ -63,8 +64,8 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else #define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_balance_counter(mp, a, b, c) do { } while (0) -#define xfs_icsb_sync_counters(mp) do { } while (0) +#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) #endif @@ -1400,7 +1401,7 @@ xfs_log_sbcount( if (!xfs_fs_writable(mp)) return 0; - xfs_icsb_sync_counters(mp); + xfs_icsb_sync_counters(mp, 0); /* * we don't need to do this if we are updating the superblock @@ -2026,9 +2027,9 @@ xfs_icsb_cpu_notify( case CPU_ONLINE: case CPU_ONLINE_FROZEN: xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); xfs_icsb_unlock(mp); break; case CPU_DEAD: @@ -2048,12 +2049,9 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, - XFS_ICSB_SB_LOCKED, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, - XFS_ICSB_SB_LOCKED, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, - XFS_ICSB_SB_LOCKED, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); spin_unlock(&mp->m_sb_lock); xfs_icsb_unlock(mp); break; @@ -2105,9 +2103,9 @@ xfs_icsb_reinit_counters( * initial balance kicks us off correctly */ mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); xfs_icsb_unlock(mp); } @@ -2223,7 +2221,7 @@ xfs_icsb_disable_counter( if (!test_and_set_bit(field, &mp->m_icsb_counters)) { /* drain back to superblock */ - xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT); + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); switch(field) { case XFS_SBS_ICOUNT: mp->m_sb.sb_icount = cnt.icsb_icount; @@ -2278,38 +2276,33 @@ xfs_icsb_enable_counter( } void -xfs_icsb_sync_counters_flags( +xfs_icsb_sync_counters_locked( xfs_mount_t *mp, int flags) { xfs_icsb_cnts_t cnt; - /* Pass 1: lock all counters */ - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - spin_lock(&mp->m_sb_lock); - xfs_icsb_count(mp, &cnt, flags); - /* Step 3: update mp->m_sb fields */ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) mp->m_sb.sb_icount = cnt.icsb_icount; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) mp->m_sb.sb_ifree = cnt.icsb_ifree; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - spin_unlock(&mp->m_sb_lock); } /* * Accurate update of per-cpu counters to incore superblock */ -STATIC void +void xfs_icsb_sync_counters( - xfs_mount_t *mp) + xfs_mount_t *mp, + int flags) { - 
xfs_icsb_sync_counters_flags(mp, 0); + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); } /* @@ -2332,19 +2325,15 @@ xfs_icsb_sync_counters( #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void -xfs_icsb_balance_counter( +xfs_icsb_balance_counter_locked( xfs_mount_t *mp, xfs_sb_field_t field, - int flags, int min_per_cpu) { uint64_t count, resid; int weight = num_online_cpus(); uint64_t min = (uint64_t)min_per_cpu; - if (!(flags & XFS_ICSB_SB_LOCKED)) - spin_lock(&mp->m_sb_lock); - /* disable counter and sync counter */ xfs_icsb_disable_counter(mp, field); @@ -2354,19 +2343,19 @@ xfs_icsb_balance_counter( count = mp->m_sb.sb_icount; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - goto out; + return; break; case XFS_SBS_IFREE: count = mp->m_sb.sb_ifree; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - goto out; + return; break; case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - goto out; + return; break; default: BUG(); @@ -2375,9 +2364,17 @@ xfs_icsb_balance_counter( } xfs_icsb_enable_counter(mp, field, count, resid); -out: - if (!(flags & XFS_ICSB_SB_LOCKED)) - spin_unlock(&mp->m_sb_lock); +} + +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); } STATIC int @@ -2484,7 +2481,7 @@ slow_path: * we are done. */ if (ret != ENOSPC) - xfs_icsb_balance_counter(mp, field, 0, 0); + xfs_icsb_balance_counter(mp, field, 0); xfs_icsb_unlock(mp); return ret; @@ -2508,7 +2505,7 @@ balance_counter: * will either succeed through the fast path or slow path without * another balance operation being required. */ - xfs_icsb_balance_counter(mp, field, 0, delta); + xfs_icsb_balance_counter(mp, field, delta); xfs_icsb_unlock(mp); goto again; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1ed575110ff..63e0693a358 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -206,17 +206,18 @@ typedef struct xfs_icsb_cnts { #define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ -#define XFS_ICSB_SB_LOCKED (1 << 0) /* sb already locked */ #define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ extern int xfs_icsb_init_counters(struct xfs_mount *); extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #else #define xfs_icsb_init_counters(mp) (0) #define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters_flags(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) #endif typedef struct xfs_ail { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index ee371890d85..d8063e1ad29 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -55,85 +55,32 @@ xfs_rename_unlock4( xfs_iunlock(i_tab[0], lock_mode); for (i = 1; i < 4; i++) { - if (i_tab[i] == NULL) { + if (i_tab[i] == NULL) break; - } + /* * Watch out for duplicate entries in the table. 
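/*
 * The XFS_ICSB_SB_LOCKED flag is gone: each counter routine now comes as
 * a pair, a _locked variant that expects m_sb_lock to be held and a
 * plain variant that wraps it in lock/unlock, so the locking context is
 * encoded in the function name rather than a runtime flag. A minimal
 * userspace sketch of the idiom (all names illustrative):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
static long sb_fdblocks;

/* Caller must already hold sb_lock. */
static void sync_counters_locked(long delta)
{
	sb_fdblocks += delta;
}

/* Unlocked variant: take the lock, delegate, drop the lock. */
static void sync_counters(long delta)
{
	pthread_mutex_lock(&sb_lock);
	sync_counters_locked(delta);
	pthread_mutex_unlock(&sb_lock);
}

int main(void)
{
	sync_counters(42);

	pthread_mutex_lock(&sb_lock);	/* already inside a locked region... */
	sync_counters_locked(-2);	/* ...so call the _locked variant */
	pthread_mutex_unlock(&sb_lock);

	printf("%ld\n", sb_fdblocks);	/* 40 */
	return 0;
}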
*/ - if (i_tab[i] != i_tab[i-1]) { + if (i_tab[i] != i_tab[i-1]) xfs_iunlock(i_tab[i], lock_mode); - } } } -#ifdef DEBUG -int xfs_rename_skip, xfs_rename_nskip; -#endif - /* - * The following routine will acquire the locks required for a rename - * operation. The code understands the semantics of renames and will - * validate that name1 exists under dp1 & that name2 may or may not - * exist under dp2. - * - * We are renaming dp1/name1 to dp2/name2. - * - * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success. + * Enter all inodes for a rename transaction into a sorted array. */ -STATIC int -xfs_lock_for_rename( +STATIC void +xfs_sort_for_rename( xfs_inode_t *dp1, /* in: old (source) directory inode */ xfs_inode_t *dp2, /* in: new (target) directory inode */ xfs_inode_t *ip1, /* in: inode of old entry */ - struct xfs_name *name2, /* in: new entry name */ - xfs_inode_t **ipp2, /* out: inode of new entry, if it + xfs_inode_t *ip2, /* in: inode of new entry, if it already exists, NULL otherwise. */ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ int *num_inodes) /* out: number of inodes in array */ { - xfs_inode_t *ip2 = NULL; xfs_inode_t *temp; - xfs_ino_t inum1, inum2; - int error; int i, j; - uint lock_mode; - int diff_dirs = (dp1 != dp2); - - /* - * First, find out the current inums of the entries so that we - * can determine the initial locking order. We'll have to - * sanity check stuff after all the locks have been acquired - * to see if we still have the right inodes, directories, etc. - */ - lock_mode = xfs_ilock_map_shared(dp1); - IHOLD(ip1); - xfs_itrace_ref(ip1); - - inum1 = ip1->i_ino; - - /* - * Unlock dp1 and lock dp2 if they are different. - */ - if (diff_dirs) { - xfs_iunlock_map_shared(dp1, lock_mode); - lock_mode = xfs_ilock_map_shared(dp2); - } - - error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2); - if (error == ENOENT) { /* target does not need to exist. */ - inum2 = 0; - } else if (error) { - /* - * If dp2 and dp1 are the same, the next line unlocks dp1. - * Got it? - */ - xfs_iunlock_map_shared(dp2, lock_mode); - IRELE (ip1); - return error; - } else { - xfs_itrace_ref(ip2); - } /* * i_tab contains a list of pointers to inodes. We initialize @@ -145,21 +92,20 @@ xfs_lock_for_rename( i_tab[0] = dp1; i_tab[1] = dp2; i_tab[2] = ip1; - if (inum2 == 0) { - *num_inodes = 3; - i_tab[3] = NULL; - } else { + if (ip2) { *num_inodes = 4; i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; } - *ipp2 = i_tab[3]; /* * Sort the elements via bubble sort. (Remember, there are at * most 4 elements to sort, so this is adequate.) */ - for (i=0; i < *num_inodes; i++) { - for (j=1; j < *num_inodes; j++) { + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { temp = i_tab[j]; i_tab[j] = i_tab[j-1]; @@ -167,30 +113,6 @@ xfs_lock_for_rename( } } } - - /* - * We have dp2 locked. If it isn't first, unlock it. - * If it is first, tell xfs_lock_inodes so it can skip it - * when locking. if dp1 == dp2, xfs_lock_inodes will skip both - * since they are equal. xfs_lock_inodes needs all these inodes - * so that it can unlock and retry if there might be a dead-lock - * potential with the log. 
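/*
 * xfs_sort_for_rename() no longer takes any locks itself: it just orders
 * the (at most four) participating inodes by inode number, leaving
 * duplicates adjacent so the lock loop can skip them, and a single
 * xfs_lock_inodes() call later acquires them in that ascending order,
 * the invariant that keeps concurrent renames deadlock-free. A compact
 * sketch of the sort:
 */
#include <stdio.h>

struct inode_sketch { unsigned long i_ino; };

/* Bubble sort is adequate here: never more than four entries. */
static void sort_for_rename(struct inode_sketch **tab, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		for (j = 1; j < n; j++) {
			if (tab[j]->i_ino < tab[j - 1]->i_ino) {
				struct inode_sketch *tmp = tab[j];

				tab[j] = tab[j - 1];
				tab[j - 1] = tmp;
			}
		}
	}
}

int main(void)
{
	struct inode_sketch a = { 7 }, b = { 3 }, c = { 9 }, d = { 3 };
	struct inode_sketch *tab[] = { &a, &b, &c, &d };
	int i;

	sort_for_rename(tab, 4);
	for (i = 0; i < 4; i++)		/* prints: 3 3 7 9 */
		printf("%lu ", tab[i]->i_ino);
	printf("\n");
	return 0;
}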
- */ - - if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) { -#ifdef DEBUG - xfs_rename_skip++; -#endif - xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED); - } else { -#ifdef DEBUG - xfs_rename_nskip++; -#endif - xfs_iunlock_map_shared(dp2, lock_mode); - xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); - } - - return 0; } /* @@ -202,10 +124,10 @@ xfs_rename( struct xfs_name *src_name, xfs_inode_t *src_ip, xfs_inode_t *target_dp, - struct xfs_name *target_name) + struct xfs_name *target_name, + xfs_inode_t *target_ip) { - xfs_trans_t *tp; - xfs_inode_t *target_ip; + xfs_trans_t *tp = NULL; xfs_mount_t *mp = src_dp->i_mount; int new_parent; /* moving to a new dir */ int src_is_directory; /* src_name is a directory */ @@ -215,9 +137,7 @@ xfs_rename( int cancel_flags; int committed; xfs_inode_t *inodes[4]; - int target_ip_dropped = 0; /* dropped target_ip link? */ int spaceres; - int target_link_zero = 0; int num_inodes; xfs_itrace_entry(src_dp); @@ -230,64 +150,27 @@ xfs_rename( target_dp, DM_RIGHT_NULL, src_name->name, target_name->name, 0, 0, 0); - if (error) { + if (error) return error; - } } /* Return through std_return after this point. */ - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - * xfs_lock_for_rename() will return ENOENT if src_name - * does not exist in the source directory. - */ - tp = NULL; - error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name, - &target_ip, inodes, &num_inodes); - if (error) { - /* - * We have nothing locked, no inode references, and - * no transaction, so just get out. - */ - goto std_return; - } - - ASSERT(src_ip != NULL); + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (src_is_directory) { /* * Check for link count overflow on target_dp */ - if (target_ip == NULL && (src_dp != target_dp) && + if (target_ip == NULL && new_parent && target_dp->i_d.di_nlink >= XFS_MAXLINK) { error = XFS_ERROR(EMLINK); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; + goto std_return; } } - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; - } - - new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - - /* - * Drop the locks on our inodes so that we can start the transaction. - */ - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); XFS_BMAP_INIT(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); @@ -302,7 +185,7 @@ xfs_rename( } if (error) { xfs_trans_cancel(tp, 0); - goto rele_return; + goto std_return; } /* @@ -310,13 +193,29 @@ xfs_rename( */ if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { xfs_trans_cancel(tp, cancel_flags); - goto rele_return; + goto std_return; } /* - * Reacquire the inode locks we dropped above. + * Lock all the participating inodes. 
Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. */ - xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL); + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } /* * Join all the inodes to the transaction. From this point on, @@ -328,17 +227,17 @@ xfs_rename( */ IHOLD(src_dp); xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) { IHOLD(target_dp); xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); } - if ((src_ip != src_dp) && (src_ip != target_dp)) { - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - } - if ((target_ip != NULL) && - (target_ip != src_ip) && - (target_ip != src_dp) && - (target_ip != target_dp)) { + + IHOLD(src_ip); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + + if (target_ip) { + IHOLD(target_ip); xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); } @@ -412,7 +311,6 @@ xfs_rename( error = xfs_droplink(tp, target_ip); if (error) goto abort_return; - target_ip_dropped = 1; if (src_is_directory) { /* @@ -422,10 +320,6 @@ xfs_rename( if (error) goto abort_return; } - - /* Do this test while we still hold the locks */ - target_link_zero = (target_ip)->i_d.di_nlink==0; - } /* target_ip != NULL */ /* @@ -492,15 +386,6 @@ xfs_rename( } /* - * If there was a target inode, take an extra reference on - * it here so that it doesn't go to xfs_inactive() from - * within the commit. - */ - if (target_ip != NULL) { - IHOLD(target_ip); - } - - /* * If this is a synchronous mount, make sure that the * rename transaction goes to disk before returning to * the user. @@ -509,30 +394,11 @@ xfs_rename( xfs_trans_set_sync(tp); } - /* - * Take refs. for vop_link_removed calls below. No need to worry - * about directory refs. because the caller holds them. - * - * Do holds before the xfs_bmap_finish since it might rele them down - * to zero. - */ - - if (target_ip_dropped) - IHOLD(target_ip); - IHOLD(src_ip); - error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - if (target_ip != NULL) { - IRELE(target_ip); - } - if (target_ip_dropped) { - IRELE(target_ip); - } - IRELE(src_ip); goto std_return; } @@ -541,15 +407,6 @@ xfs_rename( * the vnode references. */ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (target_ip != NULL) - IRELE(target_ip); - /* - * Let interposed file systems know about removed links. 
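/*
 * With IHOLD() taken immediately before each xfs_trans_ijoin(), the
 * commit consumes exactly one reference per joined inode, and the old
 * scattered "extra reference so the inode does not reach xfs_inactive()
 * inside the commit" holds with their matching IRELE() calls disappear.
 * A toy sketch of the pairing, using a hypothetical refcount field:
 */
#include <assert.h>
#include <stdio.h>

struct inode_sketch { int refcount; };

static void ihold(struct inode_sketch *ip) { ip->refcount++; }
static void irele(struct inode_sketch *ip) { ip->refcount--; }

/* The transaction drops one reference per joined inode at commit time. */
static void trans_commit(struct inode_sketch **joined, int n)
{
	int i;

	for (i = 0; i < n; i++)
		irele(joined[i]);
}

int main(void)
{
	struct inode_sketch dp = { .refcount = 1 }, ip = { .refcount = 1 };
	struct inode_sketch *joined[] = { &dp, &ip };

	/* Grab the reference right where the inode joins the transaction. */
	ihold(&dp);
	ihold(&ip);
	trans_commit(joined, 2);

	/* The callers' original references survive the commit. */
	assert(dp.refcount == 1 && ip.refcount == 1);
	printf("ok\n");
	return 0;
}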
- */ - if (target_ip_dropped) - IRELE(target_ip); - - IRELE(src_ip); /* Fall through to std_return with error = 0 or errno from * xfs_trans_commit */ @@ -571,11 +428,4 @@ std_return: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, cancel_flags); goto std_return; - - rele_return: - IRELE(src_ip); - if (target_ip != NULL) { - IRELE(target_ip); - } - goto std_return; } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index b8db1d5cde5..4c70bf5e998 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,13 +111,13 @@ xfs_trans_iget( */ ASSERT(ip->i_itemp != NULL); ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - ismrlocked(&ip->i_iolock, MR_UPDATE)); + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS))); + xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); @@ -185,7 +185,7 @@ xfs_trans_ijoin( xfs_inode_log_item_t *iip; ASSERT(ip->i_transp == NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(lock_flags & XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); @@ -232,7 +232,7 @@ xfs_trans_ihold( { ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_itemp->ili_flags |= XFS_ILI_HOLD; } @@ -257,7 +257,7 @@ xfs_trans_log_inode( ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); ASSERT(lidp != NULL); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 2b8dc7e4077..98e5f110ba5 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -41,49 +41,6 @@ #include "xfs_utils.h" -int -xfs_dir_lookup_int( - xfs_inode_t *dp, - uint lock_mode, - struct xfs_name *name, - xfs_ino_t *inum, - xfs_inode_t **ipp) -{ - int error; - - xfs_itrace_entry(dp); - - error = xfs_dir_lookup(NULL, dp, name, inum); - if (!error) { - /* - * Unlock the directory. We do this because we can't - * hold the directory lock while doing the vn_get() - * in xfs_iget(). Doing so could cause us to hold - * a lock while waiting for the inode to finish - * being inactive while it's waiting for a log - * reservation in the inactive routine. - */ - xfs_iunlock(dp, lock_mode); - error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0); - xfs_ilock(dp, lock_mode); - - if (error) { - *ipp = NULL; - } else if ((*ipp)->i_d.di_mode == 0) { - /* - * The inode has been freed. Something is - * wrong so just get out of here. - */ - xfs_iunlock(dp, lock_mode); - xfs_iput_new(*ipp, 0); - *ipp = NULL; - xfs_ilock(dp, lock_mode); - error = XFS_ERROR(ENOENT); - } - } - return error; -} - /* * Allocates a new inode from disk and return a pointer to the * incore copy. 
This routine will internally commit the current @@ -310,7 +267,7 @@ xfs_bump_ino_vers2( { xfs_mount_t *mp; - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); ip->i_d.di_version = XFS_DINODE_VERSION_2; diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index 175b126d2ca..f316cb85d8e 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -21,8 +21,6 @@ #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) -extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *, - xfs_ino_t *, xfs_inode_t **); extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index fc48158fe47..30bacd8bb0e 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -186,6 +186,7 @@ xfs_cleanup(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_log_ticket_zone); } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6650601c64f..70702a60b4b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -76,132 +76,6 @@ xfs_open( } /* - * xfs_getattr - */ -int -xfs_getattr( - xfs_inode_t *ip, - bhv_vattr_t *vap, - int flags) -{ - bhv_vnode_t *vp = XFS_ITOV(ip); - xfs_mount_t *mp = ip->i_mount; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (!(flags & ATTR_LAZY)) - xfs_ilock(ip, XFS_ILOCK_SHARED); - - vap->va_size = XFS_ISIZE(ip); - if (vap->va_mask == XFS_AT_SIZE) - goto all_done; - - vap->va_nblocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - vap->va_nodeid = ip->i_ino; -#if XFS_BIG_INUMS - vap->va_nodeid += mp->m_inoadd; -#endif - vap->va_nlink = ip->i_d.di_nlink; - - /* - * Quick exit for non-stat callers - */ - if ((vap->va_mask & - ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID| - XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) - goto all_done; - - /* - * Copy from in-core inode. - */ - vap->va_mode = ip->i_d.di_mode; - vap->va_uid = ip->i_d.di_uid; - vap->va_gid = ip->i_d.di_gid; - vap->va_projid = ip->i_d.di_projid; - - /* - * Check vnode type block/char vs. everything else. - */ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - vap->va_rdev = ip->i_df.if_u2.if_rdev; - vap->va_blocksize = BLKDEV_IOSIZE; - break; - default: - vap->va_rdev = 0; - - if (!(XFS_IS_REALTIME_INODE(ip))) { - vap->va_blocksize = xfs_preferred_iosize(mp); - } else { - - /* - * If the file blocks are being allocated from a - * realtime partition, then return the inode's - * realtime extent size or the realtime volume's - * extent size. - */ - vap->va_blocksize = - xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; - } - break; - } - - vn_atime_to_timespec(vp, &vap->va_atime); - vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - - /* - * Exit for stat callers. See if any of the rest of the fields - * to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - /* - * Convert di_flags to xflags. - */ - vap->va_xflags = xfs_ip2xflags(ip); - - /* - * Exit for inode revalidate. 
See if any of the rest of - * the fields to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog; - vap->va_nextents = - (ip->i_df.if_flags & XFS_IFEXTENTS) ? - ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_nextents; - if (ip->i_afp) - vap->va_anextents = - (ip->i_afp->if_flags & XFS_IFEXTENTS) ? - ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_anextents; - else - vap->va_anextents = 0; - vap->va_gen = ip->i_d.di_gen; - - all_done: - if (!(flags & ATTR_LAZY)) - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -/* * xfs_setattr */ int @@ -211,7 +85,6 @@ xfs_setattr( int flags, cred_t *credp) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; int mask; @@ -222,7 +95,6 @@ xfs_setattr( gid_t gid=0, igid=0; int timeflags = 0; xfs_prid_t projid=0, iprojid=0; - int mandlock_before, mandlock_after; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int file_owner; int need_iolock = 1; @@ -383,7 +255,7 @@ xfs_setattr( m |= S_ISGID; #if 0 /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp)) + if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) m |= S_ISVTX; #endif if (m && !capable(CAP_FSETID)) @@ -461,10 +333,10 @@ xfs_setattr( goto error_return; } - if (VN_ISDIR(vp)) { + if (S_ISDIR(ip->i_d.di_mode)) { code = XFS_ERROR(EISDIR); goto error_return; - } else if (!VN_ISREG(vp)) { + } else if (!S_ISREG(ip->i_d.di_mode)) { code = XFS_ERROR(EINVAL); goto error_return; } @@ -626,9 +498,6 @@ xfs_setattr( xfs_trans_ihold(tp, ip); } - /* determine whether mandatory locking mode changes */ - mandlock_before = MANDLOCK(vp, ip->i_d.di_mode); - /* * Truncate file. Must have write permission and not be a directory. */ @@ -858,13 +727,6 @@ xfs_setattr( code = xfs_trans_commit(tp, commit_flags); } - /* - * If the (regular) file's mandatory locking mode changed, then - * notify the vnode. We do this under the inode lock to prevent - * racing calls to vop_vnode_change. - */ - mandlock_after = MANDLOCK(vp, ip->i_d.di_mode); - xfs_iunlock(ip, lock_flags); /* @@ -1443,7 +1305,7 @@ xfs_inactive_attrs( int error; xfs_mount_t *mp; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_d.di_forkoff != 0); @@ -1491,7 +1353,7 @@ xfs_release( xfs_mount_t *mp = ip->i_mount; int error; - if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1774,8 +1636,7 @@ xfs_lookup( struct xfs_name *name, xfs_inode_t **ipp) { - xfs_inode_t *ip; - xfs_ino_t e_inum; + xfs_ino_t inum; int error; uint lock_mode; @@ -1785,12 +1646,21 @@ xfs_lookup( return XFS_ERROR(EIO); lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip); - if (!error) { - *ipp = ip; - xfs_itrace_ref(ip); - } + error = xfs_dir_lookup(NULL, dp, name, &inum); xfs_iunlock_map_shared(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); + if (error) + goto out; + + xfs_itrace_ref(*ipp); + return 0; + + out: + *ipp = NULL; return error; } @@ -1906,7 +1776,7 @@ xfs_create( * It is locked (and joined to the transaction). 
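/*
 * The rewritten xfs_lookup() resolves the name to an inode number under
 * the shared directory lock, drops that lock, and only then calls
 * xfs_iget(), so it never sleeps in iget (e.g. on an inode still going
 * through inactivation) while holding the directory, the deadlock the
 * old xfs_dir_lookup_int() comment warned about. A shape-only sketch;
 * lookup_dir() and iget() are hypothetical stand-ins:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t dir_lock = PTHREAD_RWLOCK_INITIALIZER;

static int lookup_dir(const char *name, unsigned long *inum)
{
	(void)name;
	*inum = 128;		/* pretend the entry resolved to inode 128 */
	return 0;
}

static int iget(unsigned long inum, void **ipp)
{
	static int dummy_inode;

	(void)inum;
	*ipp = &dummy_inode;	/* in the kernel this step may block */
	return 0;
}

static int lookup(const char *name, void **ipp)
{
	unsigned long inum;
	int error;

	pthread_rwlock_rdlock(&dir_lock);
	error = lookup_dir(name, &inum);
	pthread_rwlock_unlock(&dir_lock);	/* drop before blocking iget */

	if (error)
		goto out;
	error = iget(inum, ipp);
	if (error)
		goto out;
	return 0;

out:
	*ipp = NULL;
	return error;
}

int main(void)
{
	void *ip;

	printf("%d\n", lookup("file", &ip));	/* 0 */
	return 0;
}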
*/ - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Now we join the directory inode to the transaction. We do not do it @@ -2112,7 +1982,7 @@ again: ips[0] = ip; ips[1] = dp; - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); } /* else e_inum == dp->i_ino */ /* This can happen if we're asked to lock /x/.. @@ -2160,7 +2030,6 @@ void xfs_lock_inodes( xfs_inode_t **ips, int inodes, - int first_locked, uint lock_mode) { int attempts = 0, i, j, try_lock; @@ -2168,13 +2037,8 @@ xfs_lock_inodes( ASSERT(ips && (inodes >= 2)); /* we need at least two */ - if (first_locked) { - try_lock = 1; - i = 1; - } else { - try_lock = 0; - i = 0; - } + try_lock = 0; + i = 0; again: for (; i < inodes; i++) { @@ -2298,29 +2162,14 @@ xfs_remove( return error; } - /* - * We need to get a reference to ip before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - IHOLD(ip); - xfs_itrace_entry(ip); xfs_itrace_ref(ip); error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != ip) + if (!error) error = XFS_QM_DQATTACH(mp, ip, 0); if (error) { REMOVE_DEBUG_TRACE(__LINE__); - IRELE(ip); goto std_return; } @@ -2347,7 +2196,6 @@ xfs_remove( ASSERT(error != ENOSPC); REMOVE_DEBUG_TRACE(__LINE__); xfs_trans_cancel(tp, 0); - IRELE(ip); return error; } @@ -2355,7 +2203,6 @@ xfs_remove( if (error) { REMOVE_DEBUG_TRACE(__LINE__); xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); goto std_return; } @@ -2363,23 +2210,18 @@ xfs_remove( * At this point, we've gotten both the directory and the entry * inodes locked. */ + IHOLD(ip); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != ip) { - /* - * Increment vnode ref count only in this case since - * there's an extra vnode reference in the case where - * dp == ip. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - } + + IHOLD(dp); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. */ XFS_BMAP_INIT(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, 0); + &first_block, &free_list, resblks); if (error) { ASSERT(error != ENOENT); REMOVE_DEBUG_TRACE(__LINE__); @@ -2402,12 +2244,6 @@ xfs_remove( link_zero = (ip)->i_d.di_nlink==0; /* - * Take an extra ref on the inode so that it doesn't - * go to xfs_inactive() from within the commit. - */ - IHOLD(ip); - - /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2423,10 +2259,8 @@ xfs_remove( } error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) { - IRELE(ip); + if (error) goto std_return; - } /* * If we are using filestreams, kill the stream association. 
@@ -2438,7 +2272,6 @@ xfs_remove( xfs_filestream_deassociate(ip); xfs_itrace_exit(ip); - IRELE(ip); /* Fall through to std_return with error = 0 */ std_return: @@ -2467,8 +2300,6 @@ xfs_remove( cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); - goto std_return; } @@ -2536,7 +2367,7 @@ xfs_link( ips[1] = sip; } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); /* * Increment vnode ref counts since xfs_trans_commit & @@ -2840,7 +2671,6 @@ xfs_rmdir( struct xfs_name *name, xfs_inode_t *cdp) { - bhv_vnode_t *dir_vp = XFS_ITOV(dp); xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp; int error; @@ -2866,27 +2696,12 @@ xfs_rmdir( } /* - * We need to get a reference to cdp before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - IHOLD(cdp); - - /* * Get the dquots for the inodes. */ error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != cdp) + if (!error) error = XFS_QM_DQATTACH(mp, cdp, 0); if (error) { - IRELE(cdp); REMOVE_DEBUG_TRACE(__LINE__); goto std_return; } @@ -2913,7 +2728,6 @@ xfs_rmdir( if (error) { ASSERT(error != ENOSPC); cancel_flags = 0; - IRELE(cdp); goto error_return; } XFS_BMAP_INIT(&free_list, &first_block); @@ -2927,21 +2741,13 @@ xfs_rmdir( error = xfs_lock_dir_and_entry(dp, cdp); if (error) { xfs_trans_cancel(tp, cancel_flags); - IRELE(cdp); goto std_return; } + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != cdp) { - /* - * Only increment the parent directory vnode count if - * we didn't bump it in looking up cdp. The only time - * we don't bump it is when we're looking up ".". - */ - VN_HOLD(dir_vp); - } - xfs_itrace_ref(cdp); + IHOLD(cdp); xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); ASSERT(cdp->i_d.di_nlink >= 2); @@ -2995,12 +2801,6 @@ xfs_rmdir( last_cdp_link = (cdp)->i_d.di_nlink==0; /* - * Take an extra ref on the child vnode so that it - * does not go to xfs_inactive() from within the commit. - */ - IHOLD(cdp); - - /* * If this is a synchronous mount, make sure that the * rmdir transaction goes to disk before returning to * the user. @@ -3014,19 +2814,15 @@ xfs_rmdir( xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - IRELE(cdp); goto std_return; } error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { - IRELE(cdp); goto std_return; } - IRELE(cdp); - /* Fall through to std_return with error = 0 or the errno * from xfs_trans_commit. 
*/ std_return: diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 24c53923dc2..8abe8f186e2 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -15,7 +15,6 @@ struct xfs_iomap; int xfs_open(struct xfs_inode *ip); -int xfs_getattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags); int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, struct cred *credp); int xfs_readlink(struct xfs_inode *ip, char *link); @@ -48,9 +47,9 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd, struct cred *credp, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, - struct xfs_name *target_name); + struct xfs_name *target_name, struct xfs_inode *target_ip); int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, - int *valuelenp, int flags, cred_t *cred); + int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, int valuelen, int flags); int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); @@ -61,9 +60,6 @@ int xfs_ioctl(struct xfs_inode *ip, struct file *filp, ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, int ioflags); -ssize_t xfs_sendfile(struct xfs_inode *ip, struct file *filp, - loff_t *offset, int ioflags, size_t count, - read_actor_t actor, void *target); ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, int ioflags);