From bb0eb217c980d50c45f3e793b4dcc70ab9ee820d Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 12 Aug 2008 12:40:50 +0300
Subject: [MTD] Define and use MTD_FAIL_ADDR_UNKNOWN instead of 0xffffffff

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/erase.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dddb2a6c9e2..259461b910a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	instr->len = c->sector_size;
 	instr->callback = jffs2_erase_callback;
 	instr->priv = (unsigned long)(&instr[1]);
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
 	/* For NAND, if the failure did not occur at the device level for a
 	   specific physical page, don't bother updating the bad block table. */
-	if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) {
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
 		/* We had a device-level failure to erase.  Let's see if we've
 		   failed too many times. */
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
-- 
cgit v1.2.3


From bde86fec7c822b6009d3cfefc20b76b8d34716af Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 14 Aug 2008 11:57:45 +0300
Subject: [JFFS2] Correct symlink name too long error code

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index cd219ef5525..b1aaae823a5 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	/* FIXME: If you care. We'd need to use frags for the target
 	   if it grows much more than this */
 	if (targetlen > 254)
-		return -EINVAL;
+		return -ENAMETOOLONG;
 
 	ri = jffs2_alloc_raw_inode();
 
-- 
cgit v1.2.3


From 75caf6b5acc6b895df9bdd36db631220e1096e9f Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Mon, 18 Aug 2008 16:23:53 +0100
Subject: [JFFS2] Fill in f_fsid field in jffs2_statfs()

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/fs.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 086c4383022..89e9b735d8d 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = JFFS2_MAX_NAME_LEN;
+	buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC;
+	buf->f_fsid.val[1] = c->mtd->index;
 
 	spin_lock(&c->erase_completion_lock);
 	avail = c->dirty_size + c->free_size;
-- 
cgit v1.2.3


From 31db6e9ea1dbdcf66b8227b4f7035dee1b1dd8c0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 29 Aug 2008 07:19:50 +0400
Subject: [JFFS2] Move JFFS2 config options out of fs/Kconfig

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/Kconfig       | 190 +------------------------------------------------------
 fs/jffs2/Kconfig | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+), 189 deletions(-)
 create mode 100644 fs/jffs2/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d3873583360..5831f9c3884 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1136,195 +1136,7 @@ config EFS_FS
 	  To compile the EFS file system support as a module, choose M here: the
 	  module will be called efs.
 
-config JFFS2_FS
-	tristate "Journalling Flash File System v2 (JFFS2) support"
-	select CRC32
-	depends on MTD
-	help
-	  JFFS2 is the second generation of the Journalling Flash File System
-	  for use on diskless embedded devices. It provides improved wear
-	  levelling, compression and support for hard links. You cannot use
-	  this on normal block devices, only on 'MTD' devices.
-
-	  Further information on the design and implementation of JFFS2 is
-	  available at <http://sources.redhat.com/jffs2/>.
-
-config JFFS2_FS_DEBUG
-	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
-	depends on JFFS2_FS
-	default "0"
-	help
-	  This controls the amount of debugging messages produced by the JFFS2
-	  code. Set it to zero for use in production systems. For evaluation,
-	  testing and debugging, it's advisable to set it to one. This will
-	  enable a few assertions and will print debugging messages at the
-	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
-	  is unlikely to be useful - it enables extra debugging in certain
-	  areas which at one point needed debugging, but when the bugs were
-	  located and fixed, the detailed messages were relegated to level 2.
-
-	  If reporting bugs, please try to have available a full dump of the
-	  messages at debug level 1 while the misbehaviour was occurring.
-
-config JFFS2_FS_WRITEBUFFER
-	bool "JFFS2 write-buffering support"
-	depends on JFFS2_FS
-	default y
-	help
-	  This enables the write-buffering support in JFFS2.
-
-	  This functionality is required to support JFFS2 on the following
-	  types of flash devices:
-	    - NAND flash
-	    - NOR flash with transparent ECC
-	    - DataFlash
-
-config JFFS2_FS_WBUF_VERIFY
-	bool "Verify JFFS2 write-buffer reads"
-	depends on JFFS2_FS_WRITEBUFFER
-	default n
-	help
-	  This causes JFFS2 to read back every page written through the
-	  write-buffer, and check for errors.
-
-config JFFS2_SUMMARY
-	bool "JFFS2 summary support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  This feature makes it possible to use summary information
-	  for faster filesystem mount.
-
-	  The summary information can be inserted into a filesystem image
-	  by the utility 'sumtool'.
-
-	  If unsure, say 'N'.
-
-config JFFS2_FS_XATTR
-	bool "JFFS2 XATTR support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config JFFS2_FS_POSIX_ACL
-	bool "JFFS2 POSIX Access Control Lists"
-	depends on JFFS2_FS_XATTR
-	default y
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config JFFS2_FS_SECURITY
-	bool "JFFS2 Security Labels"
-	depends on JFFS2_FS_XATTR
-	default y
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the jffs2 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JFFS2_COMPRESSION_OPTIONS
-	bool "Advanced compression options for JFFS2"
-	depends on JFFS2_FS
-	default n
-	help
-	  Enabling this option allows you to explicitly choose which
-	  compression modules, if any, are enabled in JFFS2. Removing
-	  compressors can mean you cannot read existing file systems,
-	  and enabling experimental compressors can mean that you
-	  write a file system which cannot be read by a standard kernel.
-
-	  If unsure, you should _definitely_ say 'N'.
-
-config JFFS2_ZLIB
-	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
-	select ZLIB_INFLATE
-	select ZLIB_DEFLATE
-	depends on JFFS2_FS
-	default y
-	help
-	  Zlib is designed to be a free, general-purpose, legally unencumbered,
-	  lossless data-compression library for use on virtually any computer
-	  hardware and operating system. See <http://www.gzip.org/zlib/> for
-	  further information.
-
-	  Say 'Y' if unsure.
-
-config JFFS2_LZO
-	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
-	select LZO_COMPRESS
-	select LZO_DECOMPRESS
-	depends on JFFS2_FS
-	default n
-	help
-	  minilzo-based compression. Generally works better than Zlib.
-
-	  This feature was added in July, 2007. Say 'N' if you need
-	  compatibility with older bootloaders or kernels.
-
-config JFFS2_RTIME
-	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default y
-	help
-	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
-
-config JFFS2_RUBIN
-	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default n
-	help
-	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
-
-choice
-	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
-	default JFFS2_CMODE_PRIORITY
-	depends on JFFS2_FS
-	help
-	  You can set here the default compression mode of JFFS2 from
-	  the available compression modes. Don't touch if unsure.
-
-config JFFS2_CMODE_NONE
-	bool "no compression"
-	help
-	  Uses no compression.
-
-config JFFS2_CMODE_PRIORITY
-	bool "priority"
-	help
-	  Tries the compressors in a predefined order and chooses the first
-	  successful one.
-
-config JFFS2_CMODE_SIZE
-	bool "size (EXPERIMENTAL)"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result.
-
-config JFFS2_CMODE_FAVOURLZO
-	bool "Favour LZO"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result but gives some preference to LZO (which has faster
-	  decompression) at the expense of size.
-
-endchoice
-
+source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
 
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
new file mode 100644
index 00000000000..6ae169cd8fa
--- /dev/null
+++ b/fs/jffs2/Kconfig
@@ -0,0 +1,188 @@
+config JFFS2_FS
+	tristate "Journalling Flash File System v2 (JFFS2) support"
+	select CRC32
+	depends on MTD
+	help
+	  JFFS2 is the second generation of the Journalling Flash File System
+	  for use on diskless embedded devices. It provides improved wear
+	  levelling, compression and support for hard links. You cannot use
+	  this on normal block devices, only on 'MTD' devices.
+
+	  Further information on the design and implementation of JFFS2 is
+	  available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+	depends on JFFS2_FS
+	default "0"
+	help
+	  This controls the amount of debugging messages produced by the JFFS2
+	  code. Set it to zero for use in production systems. For evaluation,
+	  testing and debugging, it's advisable to set it to one. This will
+	  enable a few assertions and will print debugging messages at the
+	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+	  is unlikely to be useful - it enables extra debugging in certain
+	  areas which at one point needed debugging, but when the bugs were
+	  located and fixed, the detailed messages were relegated to level 2.
+
+	  If reporting bugs, please try to have available a full dump of the
+	  messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
+	depends on JFFS2_FS
+	default y
+	help
+	  This enables the write-buffering support in JFFS2.
+
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
+
+config JFFS2_FS_WBUF_VERIFY
+	bool "Verify JFFS2 write-buffer reads"
+	depends on JFFS2_FS_WRITEBUFFER
+	default n
+	help
+	  This causes JFFS2 to read back every page written through the
+	  write-buffer, and check for errors.
+
+config JFFS2_SUMMARY
+	bool "JFFS2 summary support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  This feature makes it possible to use summary information
+	  for faster filesystem mount.
+
+	  The summary information can be inserted into a filesystem image
+	  by the utility 'sumtool'.
+
+	  If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFFS2_COMPRESSION_OPTIONS
+	bool "Advanced compression options for JFFS2"
+	depends on JFFS2_FS
+	default n
+	help
+	  Enabling this option allows you to explicitly choose which
+	  compression modules, if any, are enabled in JFFS2. Removing
+	  compressors can mean you cannot read existing file systems,
+	  and enabling experimental compressors can mean that you
+	  write a file system which cannot be read by a standard kernel.
+
+	  If unsure, you should _definitely_ say 'N'.
+
+config JFFS2_ZLIB
+	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	depends on JFFS2_FS
+	default y
+	help
+	  Zlib is designed to be a free, general-purpose, legally unencumbered,
+	  lossless data-compression library for use on virtually any computer
+	  hardware and operating system. See <http://www.gzip.org/zlib/> for
+	  further information.
+
+	  Say 'Y' if unsure.
+
+config JFFS2_LZO
+	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	depends on JFFS2_FS
+	default n
+	help
+	  minilzo-based compression. Generally works better than Zlib.
+
+	  This feature was added in July, 2007. Say 'N' if you need
+	  compatibility with older bootloaders or kernels.
+
+config JFFS2_RTIME
+	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default y
+	help
+	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
+
+config JFFS2_RUBIN
+	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default n
+	help
+	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
+
+choice
+	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
+	default JFFS2_CMODE_PRIORITY
+	depends on JFFS2_FS
+	help
+	  You can set here the default compression mode of JFFS2 from
+	  the available compression modes. Don't touch if unsure.
+
+config JFFS2_CMODE_NONE
+	bool "no compression"
+	help
+	  Uses no compression.
+
+config JFFS2_CMODE_PRIORITY
+	bool "priority"
+	help
+	  Tries the compressors in a predefined order and chooses the first
+	  successful one.
+
+config JFFS2_CMODE_SIZE
+	bool "size (EXPERIMENTAL)"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result.
+
+config JFFS2_CMODE_FAVOURLZO
+	bool "Favour LZO"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result but gives some preference to LZO (which has faster
+	  decompression) at the expense of size.
+
+endchoice
-- 
cgit v1.2.3


From 3fc678a0e63138f56109ea31850f19b2e29c45b8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Aug 2008 14:48:32 +0100
Subject: CRED: Wrap task credential accesses in the JFFS2 filesystem

Wrap access to task credentials so that they can be separated more easily from
the task_struct during the introduction of COW creds.

Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id().

Change some task->e?[ug]id to task_e?[ug]id().  In some places it makes more
sense to use RCU directly rather than a convenient wrapper; these will be
addressed by later patches.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/fs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 89e9b735d8d..249305d65d5 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -442,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 
 	memset(ri, 0, sizeof(*ri));
 	/* Set OS-specific defaults for new inodes */
-	ri->uid = cpu_to_je16(current->fsuid);
+	ri->uid = cpu_to_je16(current_fsuid());
 
 	if (dir_i->i_mode & S_ISGID) {
 		ri->gid = cpu_to_je16(dir_i->i_gid);
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 	} else {
-		ri->gid = cpu_to_je16(current->fsgid);
+		ri->gid = cpu_to_je16(current_fsgid());
 	}
 
 	/* POSIX ACLs have to be processed now, at least partly.
-- 
cgit v1.2.3


From b773ad40aca5bd755ba886620842f16e8fef6d75 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 31 Aug 2008 08:16:57 -0700
Subject: select: add poll_select_set_timeout() and
 poll_select_copy_remaining() helpers

This patch adds 2 helpers that will be used for the hrtimer based select/poll:

poll_select_set_timeout() is a helper that takes a timeout (as a second, nanosecond
pair) and turns that into a "struct timespec" that represents the absolute end time.
This is a common operation in the many select() and poll() variants and needs various,
common, sanity checks.

poll_select_copy_remaining() is a helper that takes care of copying the remaining
time to userspace, as select(), pselect() and ppoll() do. This function comes in
both a natural and a compat implementation (due to datastructure differences).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/compat.c | 51 +++++++++++++++++++++++++++++++++++++++++
 fs/select.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 075d0509970..424767c954a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1436,6 +1436,57 @@ out_ret:
 
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
diff --git a/fs/select.c b/fs/select.c
index da0e88201c3..1180a620778 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -130,6 +130,81 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to:		pointer to timespec variable for the final timeout
+ * @sec:	seconds (from user space)
+ * @nsec:	nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&ts))
+		return -EINVAL;
+
+	/* Optimize for the zero timeout value here */
+	if (!sec && !nsec) {
+		to->tv_sec = to->tv_nsec = 0;
+	} else {
+		ktime_get_ts(to);
+		*to = timespec_add_safe(*to, ts);
+	}
+	return 0;
+}
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec rts;
+	struct timeval rtv;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&rts);
+	rts = timespec_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
+
+	if (timeval) {
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+
+	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+		return ret;
+
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+
+
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
-- 
cgit v1.2.3


From 8ff3e8e85fa6c312051134b3953e397feb639f51 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 31 Aug 2008 08:26:40 -0700
Subject: select: switch select() and poll() over to hrtimers

With lots of help, input and cleanups from Thomas Gleixner

This patch switches select() and poll() over to hrtimers.

The core of the patch is replacing the "s64 timeout" with a
"struct timespec end_time" in all the plumbing.

But most of the diffstat comes from using the just introduced helpers:
	poll_select_set_timeout
	poll_select_copy_remaining
	timespec_add_safe
which make manipulating the timespec easier and less error-prone.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/compat.c | 136 +++++--------------------------
 fs/select.c | 263 ++++++++++++++++++++----------------------------------------
 2 files changed, 108 insertions(+), 291 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 424767c954a..133ed7f5d68 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1568,7 +1568,8 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int compat_core_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -1615,7 +1616,7 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -1641,7 +1642,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
 	int ret;
 
@@ -1649,43 +1650,14 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct compat_timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -1698,15 +1670,16 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1721,51 +1694,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	do {
-		if (tsp) {
-			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
-				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-				timeout += ts.tv_sec * (unsigned long)HZ;
-				ts.tv_sec = 0;
-				ts.tv_nsec = 0;
-			} else {
-				ts.tv_sec -= MAX_SELECT_SECONDS;
-				timeout = MAX_SELECT_SECONDS * HZ;
-			}
-		}
-
-		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
-
-	if (tsp) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-
-		rts.tv_sec = timeout / HZ;
-		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
-		if (rts.tv_nsec >= NSEC_PER_SEC) {
-			rts.tv_sec++;
-			rts.tv_nsec -= NSEC_PER_SEC;
-		}
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -1810,18 +1740,16 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct compat_timespec ts;
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* We assume that ts.tv_sec is always lower than
-		   the number of seconds that can be expressed in
-		   an s64. Otherwise the compiler bitches at us */
-		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-		timeout += ts.tv_sec * HZ;
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -1835,7 +1763,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -1853,31 +1781,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-					1000;
-		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/fs/select.c b/fs/select.c
index 1180a620778..f6dceb56793 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -24,6 +24,7 @@
 #include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
+#include <linux/hrtimer.h>
 
 #include <asm/uaccess.h>
 
@@ -203,8 +204,6 @@ sticky:
 	return ret;
 }
 
-
-
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
@@ -257,11 +256,12 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, s64 *timeout)
+int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
+	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
 	poll_table *wait;
-	int retval, i;
+	int retval, i, timed_out = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -273,12 +273,14 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!*timeout)
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		wait = NULL;
+		timed_out = 1;
+	}
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -334,27 +336,25 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 			cond_resched();
 		}
 		wait = NULL;
-		if (retval || !*timeout || signal_pending(current))
+		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
 			retval = table.error;
 			break;
 		}
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
-			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -375,7 +375,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout)
+			   fd_set __user *exp, struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -426,7 +426,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -452,7 +452,7 @@ out_nofds:
 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			fd_set __user *exp, struct timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct timeval tv;
 	int ret;
 
@@ -460,43 +460,14 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, tv.tv_sec,
+					    tv.tv_usec * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -506,25 +477,17 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		fd_set __user *exp, struct timespec __user *tsp,
 		const sigset_t __user *sigmask, size_t sigsetsize)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
 	}
 
 	if (sigmask) {
@@ -538,32 +501,8 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tsp) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, &end_time);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -649,18 +588,20 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-		   struct poll_wqueues *wait, s64 *timeout)
+		   struct poll_wqueues *wait, struct timespec *end_time)
 {
-	int count = 0;
 	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0;
 
 	/* Optimise the no-wait case */
-	if (!(*timeout))
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		pt = NULL;
+		timed_out = 1;
+	}
 
 	for (;;) {
 		struct poll_list *walk;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
@@ -692,27 +633,21 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			if (signal_pending(current))
 				count = -EINTR;
 		}
-		if (count || !*timeout)
+		if (count || timed_out)
 			break;
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
-			/*
-			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
-			 * a loop
-			 */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
 
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
@@ -721,7 +656,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
 			sizeof(struct pollfd))
 
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+		struct timespec *end_time)
 {
 	struct poll_wqueues table;
  	int err = -EFAULT, fdcount, len, size;
@@ -761,7 +697,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	}
 
 	poll_initwait(&table);
-	fdcount = do_poll(nfds, head, &table, timeout);
+	fdcount = do_poll(nfds, head, &table, end_time);
 	poll_freewait(&table);
 
 	for (walk = head; walk; walk = walk->next) {
@@ -787,16 +723,21 @@ out_fds:
 
 static long do_restart_poll(struct restart_block *restart_block)
 {
-	struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
-	int nfds = restart_block->arg1;
-	s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2;
+	struct pollfd __user *ufds = restart_block->poll.ufds;
+	int nfds = restart_block->poll.nfds;
+	struct timespec *to = NULL, end_time;
 	int ret;
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	if (restart_block->poll.has_timeout) {
+		end_time.tv_sec = restart_block->poll.tv_sec;
+		end_time.tv_nsec = restart_block->poll.tv_nsec;
+		to = &end_time;
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		restart_block->fn = do_restart_poll;
-		restart_block->arg2 = timeout & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout >> 32;
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -805,31 +746,32 @@ static long do_restart_poll(struct restart_block *restart_block)
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
-	if (timeout_msecs > 0) {
-#if HZ > 1000
-		/* We can only overflow if HZ > 1000 */
-		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
-			timeout_jiffies = -1;
-		else
-#endif
-			timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1;
-	} else {
-		/* Infinite (< 0) or no (0) timeout */
-		timeout_jiffies = timeout_msecs;
+	if (timeout_msecs >= 0) {
+		to = &end_time;
+		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
+
 		restart_block = &current_thread_info()->restart_block;
 		restart_block->fn = do_restart_poll;
-		restart_block->arg0 = (unsigned long)ufds;
-		restart_block->arg1 = nfds;
-		restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout_jiffies >> 32;
+		restart_block->poll.ufds = ufds;
+		restart_block->poll.nfds = nfds;
+
+		if (timeout_msecs >= 0) {
+			restart_block->poll.tv_sec = end_time.tv_sec;
+			restart_block->poll.tv_nsec = end_time.tv_nsec;
+			restart_block->poll.has_timeout = 1;
+		} else
+			restart_block->poll.has_timeout = 0;
+
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -841,21 +783,16 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	s64 timeout = -1;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -869,7 +806,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -887,31 +824,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-		sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 76369470b7e5f97fc1a8af83c45b9ff739b08cb6 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:00:14 -0700
Subject: hrtimer: convert timerfd to the new hrtimer apis

In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts timerfd to these accessors.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/timerfd.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/timerfd.c b/fs/timerfd.c
index c502c60e4f5..0862f0e49d0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -52,11 +52,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
-	ktime_t now, remaining;
-
-	now = ctx->tmr.base->get_time();
-	remaining = ktime_sub(ctx->tmr.expires, now);
+	ktime_t remaining;
 
+	remaining = hrtimer_expires_remaining(&ctx->tmr);
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -74,7 +72,7 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
 	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
-	ctx->tmr.expires = texp;
+	hrtimer_set_expires(&ctx->tmr, texp);
 	ctx->tmr.function = timerfd_tmrproc;
 	if (texp.tv64 != 0)
 		hrtimer_start(&ctx->tmr, texp, htmode);
-- 
cgit v1.2.3


From 90d6e24a3686325edea7748b966e138c9923017d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 1 Sep 2008 15:55:35 -0700
Subject: hrtimer: make select() and poll() use the hrtimer range feature

This patch makes the select() and poll() hrtimers use the new range
feature and settings from the task struct.

In addition, this includes the estimate_accuracy() function that Linus
posted to lkml, but changed entirely based on other peoples lkml feedback.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index f6dceb56793..5e61b43d076 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,58 @@
 
 #include <asm/uaccess.h>
 
+
+/*
+ * Estimate expected accuracy in ns from a timeval.
+ *
+ * After quite a bit of churning around, we've settled on
+ * a simple thing of taking 0.1% of the timeout as the
+ * slack, with a cap of 100 msec.
+ * "nice" tasks get a 0.5% slack instead.
+ *
+ * Consider this comment an open invitation to come up with even
+ * better solutions..
+ */
+
+static unsigned long __estimate_accuracy(struct timespec *tv)
+{
+	unsigned long slack;
+	int divfactor = 1000;
+
+	if (task_nice(current))
+		divfactor = divfactor / 5;
+
+	slack = tv->tv_nsec / divfactor;
+	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
+
+	if (slack > 100 * NSEC_PER_MSEC)
+		slack =  100 * NSEC_PER_MSEC;
+	return slack;
+}
+
+static unsigned long estimate_accuracy(struct timespec *tv)
+{
+	unsigned long ret;
+	struct timespec now;
+
+	/*
+	 * Realtime tasks get a slack of 0 for obvious reasons.
+	 */
+
+	if (current->policy == SCHED_FIFO ||
+		current->policy == SCHED_RR)
+		return 0;
+
+	ktime_get_ts(&now);
+	now = timespec_sub(*tv, now);
+	ret = __estimate_accuracy(&now);
+	if (ret < current->timer_slack_ns)
+		return current->timer_slack_ns;
+	return ret;
+}
+
+
+
 struct poll_table_page {
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
@@ -262,6 +314,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	struct poll_wqueues table;
 	poll_table *wait;
 	int retval, i, timed_out = 0;
+	unsigned long slack = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -278,6 +331,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		timed_out = 1;
 	}
 
+	if (end_time)
+		slack = estimate_accuracy(end_time);
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
@@ -353,7 +409,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
@@ -593,6 +649,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	poll_table* pt = &wait->pt;
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
+	unsigned long slack = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -600,6 +657,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		timed_out = 1;
 	}
 
+	if (end_time)
+		slack = estimate_accuracy(end_time);
+
 	for (;;) {
 		struct poll_list *walk;
 
@@ -646,7 +706,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS))
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3


From 4ce105d30e08fb8a1783c55a0e48aa3fa200c455 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 15:31:39 -0700
Subject: hrtimer: incorporate feedback from Peter Zijlstra

(based on  lkml review)
* use rt_task()
* task_nice() has a sign

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 5e61b43d076..fdd8584e536 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -46,7 +46,7 @@ static unsigned long __estimate_accuracy(struct timespec *tv)
 	unsigned long slack;
 	int divfactor = 1000;
 
-	if (task_nice(current))
+	if (task_nice(current) > 0)
 		divfactor = divfactor / 5;
 
 	slack = tv->tv_nsec / divfactor;
@@ -66,8 +66,7 @@ static unsigned long estimate_accuracy(struct timespec *tv)
 	 * Realtime tasks get a slack of 0 for obvious reasons.
 	 */
 
-	if (current->policy == SCHED_FIFO ||
-		current->policy == SCHED_RR)
+	if (rt_task(current))
 		return 0;
 
 	ktime_get_ts(&now);
-- 
cgit v1.2.3


From 96d2ab484e7a9bafdab44b8c7d1ef5944319b18c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 7 Sep 2008 16:08:55 -0700
Subject: hrtimer: fix signed/unsigned bug in slack estimator

the slack estimator used unsigned math; however for very short delay it's
possible that by the time you calculate the timeout, it's already passed and
you get a negative time/slack... in an unsigned variable... which then gets
turned into a 100 msec delay rather than zero.

This patch fixes this by using a signed typee in the right places.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/select.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index fdd8584e536..448e4400128 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -41,9 +41,9 @@
  * better solutions..
  */
 
-static unsigned long __estimate_accuracy(struct timespec *tv)
+static long __estimate_accuracy(struct timespec *tv)
 {
-	unsigned long slack;
+	long slack;
 	int divfactor = 1000;
 
 	if (task_nice(current) > 0)
@@ -54,10 +54,13 @@ static unsigned long __estimate_accuracy(struct timespec *tv)
 
 	if (slack > 100 * NSEC_PER_MSEC)
 		slack =  100 * NSEC_PER_MSEC;
+
+	if (slack < 0)
+		slack = 0;
 	return slack;
 }
 
-static unsigned long estimate_accuracy(struct timespec *tv)
+static long estimate_accuracy(struct timespec *tv)
 {
 	unsigned long ret;
 	struct timespec now;
@@ -330,7 +333,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		timed_out = 1;
 	}
 
-	if (end_time)
+	if (end_time && !timed_out)
 		slack = estimate_accuracy(end_time);
 
 	retval = 0;
@@ -656,7 +659,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		timed_out = 1;
 	}
 
-	if (end_time)
+	if (end_time && !timed_out)
 		slack = estimate_accuracy(end_time);
 
 	for (;;) {
-- 
cgit v1.2.3


From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang

Overview

This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling.  It was put together
with the help of Roland McGrath, the owner and original writer of this code.

The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads.  It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.

This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."

Code Changes

This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine.  (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.)  To do this, at each tick we now update fields in
signal_struct as well as task_struct.  The run_posix_cpu_timers() function
uses those fields to make its decisions.

We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels.  For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:

struct thread_group_cputime {
	struct task_cputime totals;
};

struct thread_group_cputime {
	struct task_cputime *totals;
};

We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers).  The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends.  In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention).  For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu().  The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().

We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel.  The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields.  The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures.  The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated.  The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU.  Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.

Non-SMP operation is trivial and will not be mentioned further.

The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().

All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.

Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away.  All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline.  When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.

Performance

The fix appears not to add significant overhead to existing operations.  It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below).  Overall it's a wash except in those
two cases.

I've since done somewhat more involved testing on a dual-core Opteron system.

Case 1: With no itimer running, for a test with 100,000 threads, the fixed
	kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
	all of which was spent in the system.  There were twice as many
	voluntary context switches with the fix as without it.

Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
	an unmodified kernel can handle), the fixed kernel ran the test in
	eight percent of the time (5.8 seconds as opposed to 70 seconds) and
	had better tick accuracy (.012 seconds per tick as opposed to .023
	seconds per tick).

Case 3: A 4000-thread test with an initial timer tick of .01 second and an
	interval of 10,000 seconds (i.e. a timer that ticks only once) had
	very nearly the same performance in both cases:  6.3 seconds elapsed
	for the fixed kernel versus 5.5 seconds for the unfixed kernel.

With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds).  The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.

Since the fix affected the rlimit code, I also tested soft and hard CPU limits.

Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
	running), the modified kernel was very slightly favored in that while
	it killed the process in 19.997 seconds of CPU time (5.002 seconds of
	wall time), only .003 seconds of that was system time, the rest was
	user time.  The unmodified kernel killed the process in 20.001 seconds
	of CPU (5.014 seconds of wall time) of which .016 seconds was system
	time.  Really, though, the results were too close to call.  The results
	were essentially the same with no itimer running.

Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
	(where the hard limit would never be reached) and an itimer running,
	the modified kernel exhibited worse tick accuracy than the unmodified
	kernel: .050 seconds/tick versus .028 seconds/tick.  Otherwise,
	performance was almost indistinguishable.  With no itimer running this
	test exhibited virtually identical behavior and times in both cases.

In times past I did some limited performance testing.  those results are below.

On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s.  On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds.  Performance with eight, four and one
thread were comparable.  Interestingly, the timer ticks with the fix seemed
more accurate:  The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick.  Both cases were configured for an interval of
0.01 seconds.  Again, the other tests were comparable.  Each thread in this
test computed the primes up to 25,000,000.

I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix.  In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable).  System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s).  It received 147651 ticks for 0.015 seconds per tick, still quite
accurate.  There is obviously no comparable test without the fix.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/binfmt_elf.c | 19 +++++++------------
 fs/proc/array.c |  8 ++++----
 2 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 655ed8d30a8..a8635f63703 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1333,20 +1333,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 71c9be59c9c..933953c4e40 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -395,20 +395,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
+			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime = cputime_add(utime, task_utime(t));
-				stime = cputime_add(stime, task_stime(t));
 				gtime = cputime_add(gtime, task_gtime(t));
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime = cputime_add(utime, sig->utime);
-			stime = cputime_add(stime, sig->stime);
+			thread_group_cputime(task, &cputime);
+			utime = cputime.utime;
+			stime = cputime.stime;
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
-- 
cgit v1.2.3


From 948cfb219bbbc3c8e1b10a671ca88219fa42a052 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 20 Aug 2008 11:56:33 +0300
Subject: UBIFS: add a print, fix comments and more minor stuff

This commit adds a reserved pool size print and tweaks the
prints to make them look nicer.

It also fixes and cleans-up some comments.

Additionally, it deletes some blank lines to make the code look
a little nicer.

In other words, nothing essential.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 26 ++++++++++++++------------
 fs/ubifs/lprops.c |  6 +-----
 fs/ubifs/super.c  | 16 +++++++++-------
 3 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 73db464cd08..1a4973e1066 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -414,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c)
 	 *    @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
 	 *    @c->lst.taken_empty_lebs
 	 *
-	 * @empty_lebs are available because they are empty. @freeable_cnt are
-	 * available because they contain only free and dirty space and the
-	 * index allocation always occurs after wbufs are synch'ed.
-	 * @idx_gc_cnt are available because they are index LEBs that have been
-	 * garbage collected (including trivial GC) and are awaiting the commit
-	 * before they can be unmapped - note that the in-the-gaps method will
-	 * grab these if it needs them. @taken_empty_lebs are empty_lebs that
-	 * have already been allocated for some purpose (also includes those
-	 * LEBs on the @idx_gc list).
+	 * @c->lst.empty_lebs are available because they are empty.
+	 * @c->freeable_cnt are available because they contain only free and
+	 * dirty space, @c->idx_gc_cnt are available because they are index
+	 * LEBs that have been garbage collected and are awaiting the commit
+	 * before they can be used. And the in-the-gaps method will grab these
+	 * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have
+	 * already been allocated for some purpose.
 	 *
-	 * Note, @taken_empty_lebs may temporarily be higher by one because of
-	 * the way we serialize LEB allocations and budgeting. See a comment in
-	 * 'ubifs_find_free_space()'.
+	 * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because
+	 * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they
+	 * are taken until after the commit).
+	 *
+	 * Note, @c->lst.taken_empty_lebs may temporarily be higher by one
+	 * because of the way we serialize LEB allocations and budgeting. See a
+	 * comment in 'ubifs_find_free_space()'.
 	 */
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 2ba93da71b6..3659b887743 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
 			}
 		}
 	}
+
 	/* Not greater than parent, so compare to children */
 	while (1) {
 		/* Compare to left child */
@@ -576,7 +577,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 	ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
 
 	spin_lock(&c->space_lock);
-
 	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
 		c->lst.taken_empty_lebs -= 1;
 
@@ -637,11 +637,8 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 		c->lst.taken_empty_lebs += 1;
 
 	change_category(c, lprops);
-
 	c->idx_gc_cnt += idx_gc_cnt;
-
 	spin_unlock(&c->space_lock);
-
 	return lprops;
 }
 
@@ -1262,7 +1259,6 @@ static int scan_check_cb(struct ubifs_info *c,
 	}
 
 	ubifs_scan_destroy(sleb);
-
 	return LPT_SCAN_CONTINUE;
 
 out_print:
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3f4902060c7..667c72d8a5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1144,19 +1144,21 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (mounted_read_only)
 		ubifs_msg("mounted read-only");
 	x = (long long)c->main_lebs * c->leb_size;
-	ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
-		  x, x >> 10, x >> 20, c->main_lebs);
+	ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
+		  "LEBs)", x, x >> 10, x >> 20, c->main_lebs);
 	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
-	ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
-		  x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
-	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
-	ubifs_msg("media format %d, latest format %d",
+	ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "
+		  "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
+	ubifs_msg("media format:       %d (latest is %d)",
 		  c->fmt_version, UBIFS_FORMAT_VERSION);
+	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
+	ubifs_msg("reserved pool size: %llu bytes (%llu KiB)",
+		c->report_rp_size, c->report_rp_size >> 10);
 
 	dbg_msg("compiled on:         " __DATE__ " at " __TIME__);
 	dbg_msg("min. I/O unit size:  %d bytes", c->min_io_size);
 	dbg_msg("LEB size:            %d bytes (%d KiB)",
-		c->leb_size, c->leb_size / 1024);
+		c->leb_size, c->leb_size >> 10);
 	dbg_msg("data journal heads:  %d",
 		c->jhead_cnt - NONDATA_JHEADS_CNT);
 	dbg_msg("UUID:                %02X%02X%02X%02X-%02X%02X"
-- 
cgit v1.2.3


From 8d47aef43ba166bdd11d522307c61ab23aab61c3 Mon Sep 17 00:00:00 2001
From: Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
Date: Thu, 21 Aug 2008 17:16:40 +0300
Subject: UBIFS: remove unneeded unlikely()

IS_ERR() macro already has unlikely(), so do not use constructions
like 'if (unlikely(IS_ERR())'.

Signed-off-by: Hirofumi Nakagawa <hnakagawa@miraclelinux.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/find.c  | 4 ++--
 fs/ubifs/gc.c    | 8 ++++----
 fs/ubifs/tnc.c   | 4 ++--
 fs/ubifs/xattr.c | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 47814cde240..717d79c97c5 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -901,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c)
 	 * it is needed now for this commit.
 	 */
 	lp = ubifs_lpt_lookup_dirty(c, lnum);
-	if (unlikely(IS_ERR(lp)))
+	if (IS_ERR(lp))
 		return PTR_ERR(lp);
 	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
 			     lp->flags | LPROPS_INDEX, -1);
-	if (unlikely(IS_ERR(lp)))
+	if (IS_ERR(lp))
 		return PTR_ERR(lp);
 	dbg_find("LEB %d, dirty %d and free %d flags %#x",
 		 lp->lnum, lp->dirty, lp->free, lp->flags);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 02aba36fe3d..a6b633a5629 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -653,7 +653,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 	 */
 	while (1) {
 		lp = ubifs_fast_find_freeable(c);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -665,7 +665,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 		if (err)
 			goto out;
 		lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -680,7 +680,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 	/* Record index freeable LEBs for unmapping after commit */
 	while (1) {
 		lp = ubifs_fast_find_frdi_idx(c);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -696,7 +696,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 		/* Don't release the LEB until after the next commit */
 		flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
 		lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			kfree(idx_gc);
 			goto out;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 7634c597088..ba13c92fdf6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
 	}
 
 	zn = copy_znode(c, znode);
-	if (unlikely(IS_ERR(zn)))
+	if (IS_ERR(zn))
 		return zn;
 
 	if (zbr->len) {
@@ -1128,7 +1128,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
 			ubifs_assert(znode == c->zroot.znode);
 			znode = dirty_cow_znode(c, &c->zroot);
 		}
-		if (unlikely(IS_ERR(znode)) || !p)
+		if (IS_ERR(znode) || !p)
 			break;
 		ubifs_assert(path[p - 1] >= 0);
 		ubifs_assert(path[p - 1] < znode->child_cnt);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 649bec78b64..cfd31e229c8 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -446,7 +446,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		int type;
 
 		xent = ubifs_tnc_next_ent(c, &key, &nm);
-		if (unlikely(IS_ERR(xent))) {
+		if (IS_ERR(xent)) {
 			err = PTR_ERR(xent);
 			break;
 		}
-- 
cgit v1.2.3


From 746103aca2ae2b044e32a6ab06a6536652124c99 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 27 Aug 2008 12:50:57 +0300
Subject: UBIFS: inline one-line functions

'ubifs_get_lprops()' and 'ubifs_release_lprops()' basically wrap
mutex lock and unlock. We have them because we want lprops subsystem
be separate and as independent as possible. And we planned better
locking rules for lprops.

Anyway, because they are short, it is better to inline them.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lprops.c | 28 ----------------------------
 fs/ubifs/misc.h   | 27 +++++++++++++++++++++++++++
 fs/ubifs/ubifs.h  |  2 --
 3 files changed, 27 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 3659b887743..f27176e9b70 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -460,18 +460,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
 	}
 }
 
-/**
- * ubifs_get_lprops - get reference to LEB properties.
- * @c: the UBIFS file-system description object
- *
- * This function locks lprops. Lprops have to be unlocked by
- * 'ubifs_release_lprops()'.
- */
-void ubifs_get_lprops(struct ubifs_info *c)
-{
-	mutex_lock(&c->lp_mutex);
-}
-
 /**
  * calc_dark - calculate LEB dark space size.
  * @c: the UBIFS file-system description object
@@ -642,22 +630,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 	return lprops;
 }
 
-/**
- * ubifs_release_lprops - release lprops lock.
- * @c: the UBIFS file-system description object
- *
- * This function has to be called after each 'ubifs_get_lprops()' call to
- * unlock lprops.
- */
-void ubifs_release_lprops(struct ubifs_info *c)
-{
-	ubifs_assert(mutex_is_locked(&c->lp_mutex));
-	ubifs_assert(c->lst.empty_lebs >= 0 &&
-		     c->lst.empty_lebs <= c->main_lebs);
-
-	mutex_unlock(&c->lp_mutex);
-}
-
 /**
  * ubifs_get_lp_stats - get lprops statistics.
  * @c: UBIFS file-system description object
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4c12a9215d7..4fa81d867e4 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -310,4 +310,31 @@ static inline int ubifs_tnc_lookup(struct ubifs_info *c,
 	return ubifs_tnc_locate(c, key, node, NULL, NULL);
 }
 
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+static inline void ubifs_get_lprops(struct ubifs_info *c)
+{
+	mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops.
+ */
+static inline void ubifs_release_lprops(struct ubifs_info *c)
+{
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+	ubifs_assert(c->lst.empty_lebs >= 0 &&
+		     c->lst.empty_lebs <= c->main_lebs);
+	mutex_unlock(&c->lp_mutex);
+}
+
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 17c620b93ee..ce8654928aa 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1586,12 +1586,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c);
 void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
 
 /* lprops.c */
-void ubifs_get_lprops(struct ubifs_info *c);
 const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
 					   int free, int dirty, int flags,
 					   int idx_gc_cnt);
-void ubifs_release_lprops(struct ubifs_info *c);
 void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 		      int cat);
-- 
cgit v1.2.3


From a70948b564e9f6cb81115c606d46f5b74a77b7c2 Mon Sep 17 00:00:00 2001
From: Julien Brunel <brunel@diku.dk>
Date: Fri, 29 Aug 2008 11:08:32 +0200
Subject: UBIFS: use an IS_ERR test rather than a NULL test

In case of error, the function kthread_create returns an ERR pointer,
but never returns a NULL pointer. So a NULL test that comes before an
IS_ERR test should be deleted.

The semantic match that finds this problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@match_bad_null_test@
expression x, E;
statement S1,S2;
@@
x = kthread_create(...)
... when != x = E
* if (x == NULL)
S1 else S2
// </smpl>

Signed-off-by: Julien Brunel <brunel@diku.dk>
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 667c72d8a5c..d87b0cf5f66 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1032,8 +1032,6 @@ static int mount_ubifs(struct ubifs_info *c)
 
 		/* Create background thread */
 		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
-		if (!c->bgt)
-			c->bgt = ERR_PTR(-EINVAL);
 		if (IS_ERR(c->bgt)) {
 			err = PTR_ERR(c->bgt);
 			c->bgt = NULL;
@@ -1347,8 +1345,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 
 	/* Create background thread */
 	c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
-	if (!c->bgt)
-		c->bgt = ERR_PTR(-EINVAL);
 	if (IS_ERR(c->bgt)) {
 		err = PTR_ERR(c->bgt);
 		c->bgt = NULL;
-- 
cgit v1.2.3


From 4793e7c5e1c88382ead18db5ca072bac54467318 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 2 Sep 2008 16:29:46 +0300
Subject: UBIFS: add bulk-read facility

Some flash media are capable of reading sequentially at faster rates.
UBIFS bulk-read facility is designed to take advantage of that, by
reading in one go consecutive data nodes that are also located
consecutively in the same LEB.

Read speed on Arm platform with OneNAND goes from 17 MiB/s to
19 MiB/s.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/file.c  | 248 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ubifs/key.h   |  22 ++++-
 fs/ubifs/super.c |  31 ++++++
 fs/ubifs/tnc.c   | 283 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ubifs/ubifs.h |  45 ++++++++-
 5 files changed, 626 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 3d698e2022b..cdcfe95cbfb 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -577,8 +577,256 @@ out:
 	return copied;
 }
 
+/**
+ * populate_page - copy data nodes into a page for bulk-read.
+ * @c: UBIFS file-system description object
+ * @page: page
+ * @bu: bulk-read information
+ * @n: next zbranch slot
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int populate_page(struct ubifs_info *c, struct page *page,
+			 struct bu_info *bu, int *n)
+{
+	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 1, read = 0;
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	unsigned int page_block;
+	void *addr, *zaddr;
+	pgoff_t end_index;
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+		inode->i_ino, page->index, i_size, page->flags);
+
+	addr = zaddr = kmap(page);
+
+	end_index = (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (!i_size || page->index > end_index) {
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		goto out_hole;
+	}
+
+	page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	while (1) {
+		int err, len, out_len, dlen;
+
+		if (nn >= bu->cnt ||
+		    key_block(c, &bu->zbranch[nn].key) != page_block)
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		else {
+			struct ubifs_data_node *dn;
+
+			dn = bu->buf + (bu->zbranch[nn].offs - offs);
+
+			ubifs_assert(dn->ch.sqnum >
+				     ubifs_inode(inode)->creat_sqnum);
+
+			len = le32_to_cpu(dn->size);
+			if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+				goto out_err;
+
+			dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+			out_len = UBIFS_BLOCK_SIZE;
+			err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+					       le16_to_cpu(dn->compr_type));
+			if (err || len != out_len)
+				goto out_err;
+
+			if (len < UBIFS_BLOCK_SIZE)
+				memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+			nn += 1;
+			hole = 0;
+			read = (i << UBIFS_BLOCK_SHIFT) + len;
+		}
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		addr += UBIFS_BLOCK_SIZE;
+		page_block += 1;
+	}
+
+	if (end_index == page->index) {
+		int len = i_size & (PAGE_CACHE_SIZE - 1);
+
+		if (len < read)
+			memset(zaddr + len, 0, read - len);
+	}
+
+out_hole:
+	if (hole) {
+		SetPageChecked(page);
+		dbg_gen("hole");
+	}
+
+	SetPageUptodate(page);
+	ClearPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	*n = nn;
+	return 0;
+
+out_err:
+	ClearPageUptodate(page);
+	SetPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	ubifs_err("bad data node (block %u, inode %lu)",
+		  page_block, inode->i_ino);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_do_bulk_read - do bulk-read.
+ * @c: UBIFS file-system description object
+ * @page1: first page
+ *
+ * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ */
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
+{
+	pgoff_t offset = page1->index, end_index;
+	struct address_space *mapping = page1->mapping;
+	struct inode *inode = mapping->host;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct bu_info *bu;
+	int err, page_idx, page_cnt, ret = 0, n = 0;
+	loff_t isize;
+
+	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS);
+	if (!bu)
+		return 0;
+
+	bu->buf_len = c->bulk_read_buf_size;
+	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
+	if (!bu->buf)
+		goto out_free;
+
+	data_key_init(c, &bu->key, inode->i_ino,
+		      offset << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+
+	err = ubifs_tnc_get_bu_keys(c, bu);
+	if (err)
+		goto out_warn;
+
+	if (bu->eof) {
+		/* Turn off bulk-read at the end of the file */
+		ui->read_in_a_row = 1;
+		ui->bulk_read = 0;
+	}
+
+	page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	if (!page_cnt) {
+		/*
+		 * This happens when there are multiple blocks per page and the
+		 * blocks for the first page we are looking for, are not
+		 * together. If all the pages were like this, bulk-read would
+		 * reduce performance, so we turn it off for a while.
+		 */
+		ui->read_in_a_row = 0;
+		ui->bulk_read = 0;
+		goto out_free;
+	}
+
+	if (bu->cnt) {
+		err = ubifs_tnc_bulk_read(c, bu);
+		if (err)
+			goto out_warn;
+	}
+
+	err = populate_page(c, page1, bu, &n);
+	if (err)
+		goto out_warn;
+
+	unlock_page(page1);
+	ret = 1;
+
+	isize = i_size_read(inode);
+	if (isize == 0)
+		goto out_free;
+	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	for (page_idx = 1; page_idx < page_cnt; page_idx++) {
+		pgoff_t page_offset = offset + page_idx;
+		struct page *page;
+
+		if (page_offset > end_index)
+			break;
+		page = find_or_create_page(mapping, page_offset,
+					   GFP_NOFS | __GFP_COLD);
+		if (!page)
+			break;
+		if (!PageUptodate(page))
+			err = populate_page(c, page, bu, &n);
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			break;
+	}
+
+	ui->last_page_read = offset + page_idx - 1;
+
+out_free:
+	kfree(bu->buf);
+	kfree(bu);
+	return ret;
+
+out_warn:
+	ubifs_warn("ignoring error %d and skipping bulk-read", err);
+	goto out_free;
+}
+
+/**
+ * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
+ * @page: page from which to start bulk-read.
+ *
+ * Some flash media are capable of reading sequentially at faster rates. UBIFS
+ * bulk-read facility is designed to take advantage of that, by reading in one
+ * go consecutive data nodes that are also located consecutively in the same
+ * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ */
+static int ubifs_bulk_read(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	pgoff_t index = page->index, last_page_read = ui->last_page_read;
+	int ret = 0;
+
+	ui->last_page_read = index;
+
+	if (!c->bulk_read)
+		return 0;
+	/*
+	 * Bulk-read is protected by ui_mutex, but it is an optimization, so
+	 * don't bother if we cannot lock the mutex.
+	 */
+	if (!mutex_trylock(&ui->ui_mutex))
+		return 0;
+	if (index != last_page_read + 1) {
+		/* Turn off bulk-read if we stop reading sequentially */
+		ui->read_in_a_row = 1;
+		if (ui->bulk_read)
+			ui->bulk_read = 0;
+		goto out_unlock;
+	}
+	if (!ui->bulk_read) {
+		ui->read_in_a_row += 1;
+		if (ui->read_in_a_row < 3)
+			goto out_unlock;
+		/* Three reads in a row, so switch on bulk-read */
+		ui->bulk_read = 1;
+	}
+	ret = ubifs_do_bulk_read(c, page);
+out_unlock:
+	mutex_unlock(&ui->ui_mutex);
+	return ret;
+}
+
 static int ubifs_readpage(struct file *file, struct page *page)
 {
+	if (ubifs_bulk_read(page))
+		return 0;
 	do_readpage(page);
 	unlock_page(page);
 	return 0;
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 8f747600754..9ee65086f62 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c,
  * @key2: the second key to compare
  *
  * This function compares 2 keys and returns %-1 if @key1 is less than
- * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
+ * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2.
  */
 static inline int keys_cmp(const struct ubifs_info *c,
 			   const union ubifs_key *key1,
@@ -502,6 +502,26 @@ static inline int keys_cmp(const struct ubifs_info *c,
 	return 0;
 }
 
+/**
+ * keys_eq - determine if keys are equivalent.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and
+ * %0 if not.
+ */
+static inline int keys_eq(const struct ubifs_info *c,
+			  const union ubifs_key *key1,
+			  const union ubifs_key *key2)
+{
+	if (key1->u32[0] != key2->u32[0])
+		return 0;
+	if (key1->u32[1] != key2->u32[1])
+		return 0;
+	return 1;
+}
+
 /**
  * is_hash_key - is a key vulnerable to hash collisions.
  * @c: UBIFS file-system description object
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d87b0cf5f66..b1c57e8ee85 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -401,6 +401,11 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else if (c->mount_opts.unmount_mode == 1)
 		seq_printf(s, ",norm_unmount");
 
+	if (c->mount_opts.bulk_read == 2)
+		seq_printf(s, ",bulk_read");
+	else if (c->mount_opts.bulk_read == 1)
+		seq_printf(s, ",no_bulk_read");
+
 	return 0;
 }
 
@@ -538,6 +543,18 @@ static int init_constants_early(struct ubifs_info *c)
 	 * calculations when reporting free space.
 	 */
 	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+	/* Buffer size for bulk-reads */
+	c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+	if (c->bulk_read_buf_size > c->leb_size)
+		c->bulk_read_buf_size = c->leb_size;
+	if (c->bulk_read_buf_size > 128 * 1024) {
+		/* Check if we can kmalloc more than 128KiB */
+		void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL);
+
+		kfree(try);
+		if (!try)
+			c->bulk_read_buf_size = 128 * 1024;
+	}
 	return 0;
 }
 
@@ -840,17 +857,23 @@ static int check_volume_empty(struct ubifs_info *c)
  *
  * Opt_fast_unmount: do not run a journal commit before un-mounting
  * Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_bulk_read: enable bulk-reads
+ * Opt_no_bulk_read: disable bulk-reads
  * Opt_err: just end of array marker
  */
 enum {
 	Opt_fast_unmount,
 	Opt_norm_unmount,
+	Opt_bulk_read,
+	Opt_no_bulk_read,
 	Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_fast_unmount, "fast_unmount"},
 	{Opt_norm_unmount, "norm_unmount"},
+	{Opt_bulk_read, "bulk_read"},
+	{Opt_no_bulk_read, "no_bulk_read"},
 	{Opt_err, NULL},
 };
 
@@ -888,6 +911,14 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 			c->mount_opts.unmount_mode = 1;
 			c->fast_unmount = 0;
 			break;
+		case Opt_bulk_read:
+			c->mount_opts.bulk_read = 2;
+			c->bulk_read = 1;
+			break;
+		case Opt_no_bulk_read:
+			c->mount_opts.bulk_read = 1;
+			c->bulk_read = 0;
+			break;
 		default:
 			ubifs_err("unrecognized mount option \"%s\" "
 				  "or missing value", p);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ba13c92fdf6..d279012d8dd 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1491,6 +1491,289 @@ out:
 	return err;
 }
 
+/**
+ * ubifs_tnc_get_bu_keys - lookup keys for bulk-read.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * Lookup consecutive data node keys for the same inode that reside
+ * consecutively in the same LEB.
+ */
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
+{
+	int n, err = 0, lnum = -1, uninitialized_var(offs);
+	int uninitialized_var(len);
+	unsigned int block = key_block(c, &bu->key);
+	struct ubifs_znode *znode;
+
+	bu->cnt = 0;
+	bu->blk_cnt = 0;
+	bu->eof = 0;
+
+	mutex_lock(&c->tnc_mutex);
+	/* Find first key */
+	err = ubifs_lookup_level0(c, &bu->key, &znode, &n);
+	if (err < 0)
+		goto out;
+	if (err) {
+		/* Key found */
+		len = znode->zbranch[n].len;
+		/* The buffer must be big enough for at least 1 node */
+		if (len > bu->buf_len) {
+			err = -EINVAL;
+			goto out;
+		}
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = znode->zbranch[n];
+		bu->blk_cnt += 1;
+		lnum = znode->zbranch[n].lnum;
+		offs = ALIGN(znode->zbranch[n].offs + len, 8);
+	}
+	while (1) {
+		struct ubifs_zbranch *zbr;
+		union ubifs_key *key;
+		unsigned int next_block;
+
+		/* Find next key */
+		err = tnc_next(c, &znode, &n);
+		if (err)
+			goto out;
+		zbr = &znode->zbranch[n];
+		key = &zbr->key;
+		/* See if there is another data key for this file */
+		if (key_inum(c, key) != key_inum(c, &bu->key) ||
+		    key_type(c, key) != UBIFS_DATA_KEY) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (lnum < 0) {
+			/* First key found */
+			lnum = zbr->lnum;
+			offs = ALIGN(zbr->offs + zbr->len, 8);
+			len = zbr->len;
+			if (len > bu->buf_len) {
+				err = -EINVAL;
+				goto out;
+			}
+		} else {
+			/*
+			 * The data nodes must be in consecutive positions in
+			 * the same LEB.
+			 */
+			if (zbr->lnum != lnum || zbr->offs != offs)
+				goto out;
+			offs += ALIGN(zbr->len, 8);
+			len = ALIGN(len, 8) + zbr->len;
+			/* Must not exceed buffer length */
+			if (len > bu->buf_len)
+				goto out;
+		}
+		/* Allow for holes */
+		next_block = key_block(c, key);
+		bu->blk_cnt += (next_block - block - 1);
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		block = next_block;
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = *zbr;
+		bu->blk_cnt += 1;
+		/* See if we have room for more */
+		if (bu->cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+	}
+out:
+	if (err == -ENOENT) {
+		bu->eof = 1;
+		err = 0;
+	}
+	bu->gc_seq = c->gc_seq;
+	mutex_unlock(&c->tnc_mutex);
+	if (err)
+		return err;
+	/*
+	 * An enormous hole could cause bulk-read to encompass too many
+	 * page cache pages, so limit the number here.
+	 */
+	if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+		bu->blk_cnt = UBIFS_MAX_BULK_READ;
+	/*
+	 * Ensure that bulk-read covers a whole number of page cache
+	 * pages.
+	 */
+	if (UBIFS_BLOCKS_PER_PAGE == 1 ||
+	    !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1)))
+		return 0;
+	if (bu->eof) {
+		/* At the end of file we can round up */
+		bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1;
+		return 0;
+	}
+	/* Exclude data nodes that do not make up a whole page cache page */
+	block = key_block(c, &bu->key) + bu->blk_cnt;
+	block &= ~(UBIFS_BLOCKS_PER_PAGE - 1);
+	while (bu->cnt) {
+		if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block)
+			break;
+		bu->cnt -= 1;
+	}
+	return 0;
+}
+
+/**
+ * read_wbuf - bulk-read from a LEB with a wbuf.
+ * @wbuf: wbuf that may overlap the read
+ * @buf: buffer into which to read
+ * @len: read length
+ * @lnum: LEB number from which to read
+ * @offs: offset from which to read
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
+		     int offs)
+{
+	const struct ubifs_info *c = wbuf->c;
+	int rlen, overlap;
+
+	dbg_io("LEB %d:%d, length %d", lnum, offs, len);
+	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(offs + len <= c->leb_size);
+
+	spin_lock(&wbuf->lock);
+	overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+	if (!overlap) {
+		/* We may safely unlock the write-buffer and read the data */
+		spin_unlock(&wbuf->lock);
+		return ubi_read(c->ubi, lnum, buf, offs, len);
+	}
+
+	/* Don't read under wbuf */
+	rlen = wbuf->offs - offs;
+	if (rlen < 0)
+		rlen = 0;
+
+	/* Copy the rest from the write-buffer */
+	memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+	spin_unlock(&wbuf->lock);
+
+	if (rlen > 0)
+		/* Read everything that goes before write-buffer */
+		return ubi_read(c->ubi, lnum, buf, offs, rlen);
+
+	return 0;
+}
+
+/**
+ * validate_data_node - validate data nodes for bulk-read.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing data node to validate
+ * @zbr: zbranch of data node to validate
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int validate_data_node(struct ubifs_info *c, void *buf,
+			      struct ubifs_zbranch *zbr)
+{
+	union ubifs_key key1;
+	struct ubifs_ch *ch = buf;
+	int err, len;
+
+	if (ch->node_type != UBIFS_DATA_NODE) {
+		ubifs_err("bad node type (%d but expected %d)",
+			  ch->node_type, UBIFS_DATA_NODE);
+		goto out_err;
+	}
+
+	err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0);
+	if (err) {
+		ubifs_err("expected node type %d", UBIFS_DATA_NODE);
+		goto out;
+	}
+
+	len = le32_to_cpu(ch->len);
+	if (len != zbr->len) {
+		ubifs_err("bad node length %d, expected %d", len, zbr->len);
+		goto out_err;
+	}
+
+	/* Make sure the key of the read node is correct */
+	key_read(c, buf + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, &zbr->key, &key1)) {
+		ubifs_err("bad key in node at LEB %d:%d",
+			  zbr->lnum, zbr->offs);
+		dbg_tnc("looked for key %s found node's key %s",
+			DBGKEY(&zbr->key), DBGKEY1(&key1));
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out:
+	ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
+	dbg_dump_node(c, buf);
+	dbg_dump_stack();
+	return err;
+}
+
+/**
+ * ubifs_tnc_bulk_read - read a number of data nodes in one go.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * This functions reads and validates the data nodes that were identified by the
+ * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success,
+ * -EAGAIN to indicate a race with GC, or another negative error code on
+ * failure.
+ */
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
+{
+	int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i;
+	struct ubifs_wbuf *wbuf;
+	void *buf;
+
+	len = bu->zbranch[bu->cnt - 1].offs;
+	len += bu->zbranch[bu->cnt - 1].len - offs;
+	if (len > bu->buf_len) {
+		ubifs_err("buffer too small %d vs %d", bu->buf_len, len);
+		return -EINVAL;
+	}
+
+	/* Do the read */
+	wbuf = ubifs_get_wbuf(c, lnum);
+	if (wbuf)
+		err = read_wbuf(wbuf, bu->buf, len, lnum, offs);
+	else
+		err = ubi_read(c->ubi, lnum, bu->buf, offs, len);
+
+	/* Check for a race with GC */
+	if (maybe_leb_gced(c, lnum, bu->gc_seq))
+		return -EAGAIN;
+
+	if (err && err != -EBADMSG) {
+		ubifs_err("failed to read from LEB %d:%d, error %d",
+			  lnum, offs, err);
+		dbg_dump_stack();
+		dbg_tnc("key %s", DBGKEY(&bu->key));
+		return err;
+	}
+
+	/* Validate the nodes read */
+	buf = bu->buf;
+	for (i = 0; i < bu->cnt; i++) {
+		err = validate_data_node(c, buf, &bu->zbranch[i]);
+		if (err)
+			return err;
+		buf = buf + ALIGN(bu->zbranch[i].len, 8);
+	}
+
+	return 0;
+}
+
 /**
  * do_lookup_nm- look up a "hashed" node.
  * @c: UBIFS file-system description object
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ce8654928aa..8513239ea8a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -142,6 +142,9 @@
 /* Maximum expected tree height for use by bottom_up_buf */
 #define BOTTOM_UP_HEIGHT 64
 
+/* Maximum number of data nodes to bulk-read */
+#define UBIFS_MAX_BULK_READ 32
+
 /*
  * Lockdep classes for UBIFS inode @ui_mutex.
  */
@@ -329,8 +332,8 @@ struct ubifs_gced_idx_leb {
  * @dirty: non-zero if the inode is dirty
  * @xattr: non-zero if this is an extended attribute inode
  * @ui_mutex: serializes inode write-back with the rest of VFS operations,
- *            serializes "clean <-> dirty" state changes, protects @dirty,
- *            @ui_size, and @xattr_size
+ *            serializes "clean <-> dirty" state changes, serializes bulk-read,
+ *            protects @dirty, @ui_size, and @xattr_size
  * @ui_lock: protects @synced_i_size
  * @synced_i_size: synchronized size of inode, i.e. the value of inode size
  *                 currently stored on the flash; used only for regular file
@@ -338,6 +341,9 @@ struct ubifs_gced_idx_leb {
  * @ui_size: inode size used by UBIFS when writing to flash
  * @flags: inode flags (@UBIFS_COMPR_FL, etc)
  * @compr_type: default compression type used for this inode
+ * @last_page_read: page number of last page read (for bulk read)
+ * @read_in_a_row: number of consecutive pages read in a row (for bulk read)
+ * @bulk_read: indicates whether bulk-read should be used
  * @data_len: length of the data attached to the inode
  * @data: inode's data
  *
@@ -385,6 +391,9 @@ struct ubifs_inode {
 	loff_t ui_size;
 	int flags;
 	int compr_type;
+	pgoff_t last_page_read;
+	pgoff_t read_in_a_row;
+	int bulk_read;
 	int data_len;
 	void *data;
 };
@@ -743,6 +752,28 @@ struct ubifs_znode {
 	struct ubifs_zbranch zbranch[];
 };
 
+/**
+ * struct bu_info - bulk-read information
+ * @key: first data node key
+ * @zbranch: zbranches of data nodes to bulk read
+ * @buf: buffer to read into
+ * @buf_len: buffer length
+ * @gc_seq: GC sequence number to detect races with GC
+ * @cnt: number of data nodes for bulk read
+ * @blk_cnt: number of data blocks including holes
+ * @oef: end of file reached
+ */
+struct bu_info {
+	union ubifs_key key;
+	struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ];
+	void *buf;
+	int buf_len;
+	int gc_seq;
+	int cnt;
+	int blk_cnt;
+	int eof;
+};
+
 /**
  * struct ubifs_node_range - node length range description data structure.
  * @len: fixed node length
@@ -862,9 +893,11 @@ struct ubifs_orphan {
 /**
  * struct ubifs_mount_opts - UBIFS-specific mount options information.
  * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
+ * @bulk_read: enable bulk-reads
  */
 struct ubifs_mount_opts {
 	unsigned int unmount_mode:2;
+	unsigned int bulk_read:2;
 };
 
 /**
@@ -965,6 +998,9 @@ struct ubifs_mount_opts {
  * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
  *
+ * @bulk_read: enable bulk-reads
+ * @bulk_read_buf_size: buffer size for bulk-reads
+ *
  * @dirty_pg_cnt: number of dirty pages (not used)
  * @dirty_zn_cnt: number of dirty znodes
  * @clean_zn_cnt: number of clean znodes
@@ -1205,6 +1241,9 @@ struct ubifs_info {
 	int old_leb_cnt;
 	int ro_media;
 
+	int bulk_read;
+	int bulk_read_buf_size;
+
 	atomic_long_t dirty_pg_cnt;
 	atomic_long_t dirty_zn_cnt;
 	atomic_long_t clean_zn_cnt;
@@ -1490,6 +1529,8 @@ void destroy_old_idx(struct ubifs_info *c);
 int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
 		       int lnum, int offs);
 int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu);
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu);
 
 /* tnc_misc.c */
 struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
-- 
cgit v1.2.3


From 2953e73f1ce4b3284b409aefb9d46bbde6515c37 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 4 Sep 2008 16:26:00 +0300
Subject: UBIFS: add no_chk_data_crc mount option

UBIFS read performance can be improved by skipping the CRC
check when data nodes are read.  This option can be used if
the underlying media is considered to be highly reliable.
Note that CRCs are always checked for metadata.

Read speed on Arm platform with OneNAND goes from 19 MiB/s
to 27 MiB/s with data CRC checking disabled.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/io.c    | 11 ++++++++---
 fs/ubifs/scan.c  |  2 +-
 fs/ubifs/super.c | 34 +++++++++++++++++++++++++++++++---
 fs/ubifs/tnc.c   |  6 +++++-
 fs/ubifs/ubifs.h | 11 ++++++++++-
 5 files changed, 55 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 054363f2b20..40e2790b62c 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -74,6 +74,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
  * @quiet: print no messages
+ * @chk_crc: indicates whether to always check the CRC
  *
  * This function checks node magic number and CRC checksum. This function also
  * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -85,7 +86,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * or magic.
  */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet)
+		     int offs, int quiet, int chk_crc)
 {
 	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
@@ -121,6 +122,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
+	if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
+		if (c->no_chk_data_crc)
+			return 0;
+
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
 	if (crc != node_crc) {
@@ -722,7 +727,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 		goto out;
 	}
 
-	err = ubifs_check_node(c, buf, lnum, offs, 0);
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
 	if (err) {
 		ubifs_err("expected node type %d", type);
 		return err;
@@ -781,7 +786,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
 		goto out;
 	}
 
-	err = ubifs_check_node(c, buf, lnum, offs, 0);
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
 	if (err) {
 		ubifs_err("expected node type %d", type);
 		return err;
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index acf5c5fffc6..0ed82479b44 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
 
 	dbg_scan("scanning %s", dbg_ntype(ch->node_type));
 
-	if (ubifs_check_node(c, buf, lnum, offs, quiet))
+	if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
 		return SCANNED_A_CORRUPT_NODE;
 
 	if (ch->node_type == UBIFS_PAD_NODE) {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b1c57e8ee85..cf078b5cc88 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -406,6 +406,11 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else if (c->mount_opts.bulk_read == 1)
 		seq_printf(s, ",no_bulk_read");
 
+	if (c->mount_opts.chk_data_crc == 2)
+		seq_printf(s, ",chk_data_crc");
+	else if (c->mount_opts.chk_data_crc == 1)
+		seq_printf(s, ",no_chk_data_crc");
+
 	return 0;
 }
 
@@ -859,6 +864,8 @@ static int check_volume_empty(struct ubifs_info *c)
  * Opt_norm_unmount: run a journal commit before un-mounting
  * Opt_bulk_read: enable bulk-reads
  * Opt_no_bulk_read: disable bulk-reads
+ * Opt_chk_data_crc: check CRCs when reading data nodes
+ * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
  * Opt_err: just end of array marker
  */
 enum {
@@ -866,6 +873,8 @@ enum {
 	Opt_norm_unmount,
 	Opt_bulk_read,
 	Opt_no_bulk_read,
+	Opt_chk_data_crc,
+	Opt_no_chk_data_crc,
 	Opt_err,
 };
 
@@ -874,6 +883,8 @@ static match_table_t tokens = {
 	{Opt_norm_unmount, "norm_unmount"},
 	{Opt_bulk_read, "bulk_read"},
 	{Opt_no_bulk_read, "no_bulk_read"},
+	{Opt_chk_data_crc, "chk_data_crc"},
+	{Opt_no_chk_data_crc, "no_chk_data_crc"},
 	{Opt_err, NULL},
 };
 
@@ -919,6 +930,14 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 			c->mount_opts.bulk_read = 1;
 			c->bulk_read = 0;
 			break;
+		case Opt_chk_data_crc:
+			c->mount_opts.chk_data_crc = 2;
+			c->no_chk_data_crc = 0;
+			break;
+		case Opt_no_chk_data_crc:
+			c->mount_opts.chk_data_crc = 1;
+			c->no_chk_data_crc = 1;
+			break;
 		default:
 			ubifs_err("unrecognized mount option \"%s\" "
 				  "or missing value", p);
@@ -1027,6 +1046,8 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_free;
 	}
 
+	c->always_chk_crc = 1;
+
 	err = ubifs_read_superblock(c);
 	if (err)
 		goto out_free;
@@ -1168,6 +1189,8 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_infos;
 
+	c->always_chk_crc = 0;
+
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
 	if (mounted_read_only)
@@ -1313,6 +1336,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 
 	mutex_lock(&c->umount_mutex);
 	c->remounting_rw = 1;
+	c->always_chk_crc = 1;
 
 	/* Check for enough free space */
 	if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
@@ -1381,13 +1405,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		c->bgt = NULL;
 		ubifs_err("cannot spawn \"%s\", error %d",
 			  c->bgt_name, err);
-		return err;
+		goto out;
 	}
 	wake_up_process(c->bgt);
 
 	c->orph_buf = vmalloc(c->leb_size);
-	if (!c->orph_buf)
-		return -ENOMEM;
+	if (!c->orph_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	/* Check for enough log space */
 	lnum = c->lhead_lnum + 1;
@@ -1414,6 +1440,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	dbg_gen("re-mounted read-write");
 	c->vfs_sb->s_flags &= ~MS_RDONLY;
 	c->remounting_rw = 0;
+	c->always_chk_crc = 0;
 	mutex_unlock(&c->umount_mutex);
 	return 0;
 
@@ -1429,6 +1456,7 @@ out:
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
 	c->remounting_rw = 0;
+	c->always_chk_crc = 0;
 	mutex_unlock(&c->umount_mutex);
 	return err;
 }
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index d279012d8dd..66dc57100bd 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
+	if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
+		if (c->no_chk_data_crc)
+			return 0;
+
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
 	if (crc != node_crc)
@@ -1687,7 +1691,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
 		goto out_err;
 	}
 
-	err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0);
+	err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0);
 	if (err) {
 		ubifs_err("expected node type %d", UBIFS_DATA_NODE);
 		goto out;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8513239ea8a..d6ae3f7b2b0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -894,10 +894,12 @@ struct ubifs_orphan {
  * struct ubifs_mount_opts - UBIFS-specific mount options information.
  * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
  * @bulk_read: enable bulk-reads
+ * @chk_data_crc: check CRCs when reading data nodes
  */
 struct ubifs_mount_opts {
 	unsigned int unmount_mode:2;
 	unsigned int bulk_read:2;
+	unsigned int chk_data_crc:2;
 };
 
 /**
@@ -1001,6 +1003,9 @@ struct ubifs_mount_opts {
  * @bulk_read: enable bulk-reads
  * @bulk_read_buf_size: buffer size for bulk-reads
  *
+ * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
+ *                   recovery)
+ *
  * @dirty_pg_cnt: number of dirty pages (not used)
  * @dirty_zn_cnt: number of dirty znodes
  * @clean_zn_cnt: number of clean znodes
@@ -1138,6 +1143,7 @@ struct ubifs_mount_opts {
  * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
  * @size_tree: inode size information for recovery
  * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @always_chk_crc: always check CRCs (while mounting and remounting rw)
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg_buf: a buffer of LEB size used for debugging purposes
@@ -1244,6 +1250,8 @@ struct ubifs_info {
 	int bulk_read;
 	int bulk_read_buf_size;
 
+	int no_chk_data_crc;
+
 	atomic_long_t dirty_pg_cnt;
 	atomic_long_t dirty_zn_cnt;
 	atomic_long_t clean_zn_cnt;
@@ -1374,6 +1382,7 @@ struct ubifs_info {
 	struct ubifs_mst_node *rcvrd_mst_node;
 	struct rb_root size_tree;
 	int remounting_rw;
+	int always_chk_crc;
 	struct ubifs_mount_opts mount_opts;
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
@@ -1416,7 +1425,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet);
+		     int offs, int quiet, int chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
-- 
cgit v1.2.3


From 2242c689ecc390fb4719f595751351d1ecc5c409 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 5 Sep 2008 11:56:05 +0300
Subject: UBIFS: improve znode splitting rules

When inserting into a full znode it is split into two
znodes.  Because data node keys are usually consecutive,
it is better to try to keep them together.  This patch
does a better job of that.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/tnc.c | 54 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 66dc57100bd..e0878a431b9 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1962,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
 {
 	struct ubifs_znode *zn, *zi, *zp;
 	int i, keep, move, appending = 0;
-	union ubifs_key *key = &zbr->key;
+	union ubifs_key *key = &zbr->key, *key1;
 
 	ubifs_assert(n >= 0 && n <= c->fanout);
 
@@ -2003,20 +2003,33 @@ again:
 	zn->level = znode->level;
 
 	/* Decide where to split */
-	if (znode->level == 0 && n == c->fanout &&
-	    key_type(c, key) == UBIFS_DATA_KEY) {
-		union ubifs_key *key1;
-
-		/*
-		 * If this is an inode which is being appended - do not split
-		 * it because no other zbranches can be inserted between
-		 * zbranches of consecutive data nodes anyway.
-		 */
-		key1 = &znode->zbranch[n - 1].key;
-		if (key_inum(c, key1) == key_inum(c, key) &&
-		    key_type(c, key1) == UBIFS_DATA_KEY &&
-		    key_block(c, key1) == key_block(c, key) - 1)
-			appending = 1;
+	if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) {
+		/* Try not to split consecutive data keys */
+		if (n == c->fanout) {
+			key1 = &znode->zbranch[n - 1].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY)
+				appending = 1;
+		} else
+			goto check_split;
+	} else if (appending && n != c->fanout) {
+		/* Try not to split consecutive data keys */
+		appending = 0;
+check_split:
+		if (n >= (c->fanout + 1) / 2) {
+			key1 = &znode->zbranch[0].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY) {
+				key1 = &znode->zbranch[n].key;
+				if (key_inum(c, key1) != key_inum(c, key) ||
+				    key_type(c, key1) != UBIFS_DATA_KEY) {
+					keep = n;
+					move = c->fanout - keep;
+					zi = znode;
+					goto do_split;
+				}
+			}
+		}
 	}
 
 	if (appending) {
@@ -2046,6 +2059,8 @@ again:
 			zbr->znode->parent = zn;
 	}
 
+do_split:
+
 	__set_bit(DIRTY_ZNODE, &zn->flags);
 	atomic_long_inc(&c->dirty_zn_cnt);
 
@@ -2072,14 +2087,11 @@ again:
 
 	/* Insert new znode (produced by spitting) into the parent */
 	if (zp) {
-		i = n;
+		if (n == 0 && zi == znode && znode->iip == 0)
+			correct_parent_keys(c, znode);
+
 		/* Locate insertion point */
 		n = znode->iip + 1;
-		if (appending && n != c->fanout)
-			appending = 0;
-
-		if (i == 0 && zi == znode && znode->iip == 0)
-			correct_parent_keys(c, znode);
 
 		/* Tail recursion */
 		zbr->key = zn->zbranch[0].key;
-- 
cgit v1.2.3


From ccb3eba72453a3b5aa37dda02e3a690449e3d229 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 8 Sep 2008 16:07:01 +0300
Subject: UBIFS: check data CRC when in error state

When UBIFS switches to R/O mode because of an error,
it is reasonable to enable data CRC checking.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 40e2790b62c..01682713af6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -62,6 +62,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 {
 	if (!c->ro_media) {
 		c->ro_media = 1;
+		c->no_chk_data_crc = 0;
 		ubifs_warn("switched to read-only mode, error %d", err);
 		dbg_dump_stack();
 	}
-- 
cgit v1.2.3


From 625bf371c1522764fc1cf2981b041c5f9a19e894 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 8 Sep 2008 16:13:38 +0300
Subject: UBIFS: use bit-fields when possible

The "bulk_read" and "no_chk_data_crc" have only 2 values -
0 and 1. We already have bit-fields in corresponding data
structers, so make "bulk_read" and "no_chk_data_crc"
bit-fields as well.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ubifs.h | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d6ae3f7b2b0..542cbafe76e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -331,6 +331,7 @@ struct ubifs_gced_idx_leb {
  *               this inode
  * @dirty: non-zero if the inode is dirty
  * @xattr: non-zero if this is an extended attribute inode
+ * @bulk_read: non-zero if bulk-read should be used
  * @ui_mutex: serializes inode write-back with the rest of VFS operations,
  *            serializes "clean <-> dirty" state changes, serializes bulk-read,
  *            protects @dirty, @ui_size, and @xattr_size
@@ -343,7 +344,6 @@ struct ubifs_gced_idx_leb {
  * @compr_type: default compression type used for this inode
  * @last_page_read: page number of last page read (for bulk read)
  * @read_in_a_row: number of consecutive pages read in a row (for bulk read)
- * @bulk_read: indicates whether bulk-read should be used
  * @data_len: length of the data attached to the inode
  * @data: inode's data
  *
@@ -385,6 +385,7 @@ struct ubifs_inode {
 	unsigned int xattr_names;
 	unsigned int dirty:1;
 	unsigned int xattr:1;
+	unsigned int bulk_read:1;
 	struct mutex ui_mutex;
 	spinlock_t ui_lock;
 	loff_t synced_i_size;
@@ -393,7 +394,6 @@ struct ubifs_inode {
 	int compr_type;
 	pgoff_t last_page_read;
 	pgoff_t read_in_a_row;
-	int bulk_read;
 	int data_len;
 	void *data;
 };
@@ -940,6 +940,7 @@ struct ubifs_mount_opts {
  * @cmt_state: commit state
  * @cs_lock: commit state lock
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ *
  * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
  * @check_lpt_free: flag that indicates LPT GC may be needed
@@ -947,6 +948,9 @@ struct ubifs_mount_opts {
  *           optimization)
  * @nospace_rp: the same as @nospace, but additionally means that even reserved
  *              pool is full
+ * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
+ *                   recovery)
+ * @bulk_read: enable bulk-reads
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -970,6 +974,7 @@ struct ubifs_mount_opts {
  * @mst_node: master node
  * @mst_offs: offset of valid master node
  * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ * @bulk_read_buf_size: buffer size for bulk-reads
  *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
@@ -1000,12 +1005,6 @@ struct ubifs_mount_opts {
  * @old_leb_cnt: count of logical eraseblocks before re-size
  * @ro_media: the underlying UBI volume is read-only
  *
- * @bulk_read: enable bulk-reads
- * @bulk_read_buf_size: buffer size for bulk-reads
- *
- * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
- *                   recovery)
- *
  * @dirty_pg_cnt: number of dirty pages (not used)
  * @dirty_zn_cnt: number of dirty znodes
  * @clean_zn_cnt: number of clean znodes
@@ -1188,11 +1187,14 @@ struct ubifs_info {
 	int cmt_state;
 	spinlock_t cs_lock;
 	wait_queue_head_t cmt_wq;
+
 	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
 	unsigned int check_lpt_free:1;
 	unsigned int nospace:1;
 	unsigned int nospace_rp:1;
+	unsigned int no_chk_data_crc:1;
+	unsigned int bulk_read:1;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1217,6 +1219,7 @@ struct ubifs_info {
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
 	struct mutex mst_mutex;
+	int bulk_read_buf_size;
 
 	int log_lebs;
 	long long log_bytes;
@@ -1247,11 +1250,6 @@ struct ubifs_info {
 	int old_leb_cnt;
 	int ro_media;
 
-	int bulk_read;
-	int bulk_read_buf_size;
-
-	int no_chk_data_crc;
-
 	atomic_long_t dirty_pg_cnt;
 	atomic_long_t dirty_zn_cnt;
 	atomic_long_t clean_zn_cnt;
-- 
cgit v1.2.3


From 2094c334fdebbcceddf21f97cb16b144707af56e Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 5 Sep 2008 15:20:04 +0300
Subject: UBIFS: correct key comparison

The comparison was working, but more by accident than design.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/tnc_misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index a25c1cc1f8d..b48db999903 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 	}
 
 	/* Make sure the key of the read node is correct */
-	key_read(c, key, &key1);
-	if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
+	key_read(c, node + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d",
 			  zbr->lnum, zbr->offs);
 		dbg_tnc("looked for key %s found node's key %s",
-- 
cgit v1.2.3


From ed382d5898ccfc3d7ba775be2f1596f6a1547935 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 5 Sep 2008 16:17:42 +0300
Subject: UBIFS: ensure data read beyond i_size is zeroed out correctly

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/file.c        | 10 ++++++++--
 fs/ubifs/ubifs-media.h |  1 -
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cdcfe95cbfb..2f20a49ba34 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -147,6 +147,12 @@ static int do_readpage(struct page *page)
 				err = ret;
 				if (err != -ENOENT)
 					break;
+			} else if (block + 1 == beyond) {
+				int dlen = le32_to_cpu(dn->size);
+				int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
+
+				if (ilen && ilen < dlen)
+					memset(addr + ilen, 0, dlen - ilen);
 			}
 		}
 		if (++i >= UBIFS_BLOCKS_PER_PAGE)
@@ -601,7 +607,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 	addr = zaddr = kmap(page);
 
-	end_index = (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
 	if (!i_size || page->index > end_index) {
 		memset(addr, 0, PAGE_CACHE_SIZE);
 		goto out_hole;
@@ -649,7 +655,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 	if (end_index == page->index) {
 		int len = i_size & (PAGE_CACHE_SIZE - 1);
 
-		if (len < read)
+		if (len && len < read)
 			memset(zaddr + len, 0, read - len);
 	}
 
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index a9ecbd9af20..0b378042a3a 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -75,7 +75,6 @@
  */
 #define UBIFS_BLOCK_SIZE  4096
 #define UBIFS_BLOCK_SHIFT 12
-#define UBIFS_BLOCK_MASK  0x00000FFF
 
 /* UBIFS padding byte pattern (must not be first or last byte of node magic) */
 #define UBIFS_PADDING_BYTE 0xCE
-- 
cgit v1.2.3


From ba60ecabf067c8ecbde47af4d99b74ee57234d8e Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 8 Sep 2008 16:38:01 +0300
Subject: UBIFS: fix races in bit-fields

We cannot store bit-fields together if the processes which
change them may race, unless we serialize them.

Thus, move the nospc and nospc_rp bit-fields eway from
the mount option/constant bit-fields, to avoid races.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ubifs.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 542cbafe76e..c3ac5a8221f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -334,7 +334,7 @@ struct ubifs_gced_idx_leb {
  * @bulk_read: non-zero if bulk-read should be used
  * @ui_mutex: serializes inode write-back with the rest of VFS operations,
  *            serializes "clean <-> dirty" state changes, serializes bulk-read,
- *            protects @dirty, @ui_size, and @xattr_size
+ *            protects @dirty, @bulk_read, @ui_size, and @xattr_size
  * @ui_lock: protects @synced_i_size
  * @synced_i_size: synchronized size of inode, i.e. the value of inode size
  *                 currently stored on the flash; used only for regular file
@@ -944,10 +944,6 @@ struct ubifs_mount_opts {
  * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
  * @check_lpt_free: flag that indicates LPT GC may be needed
- * @nospace: non-zero if the file-system does not have flash space (used as
- *           optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- *              pool is full
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
  * @bulk_read: enable bulk-reads
@@ -1017,12 +1013,17 @@ struct ubifs_mount_opts {
  *                        but which still have to be taken into account because
  *                        the index has not been committed so far
  * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst;
+ *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
+ *              @nospace, and @nospace_rp;
  * @min_idx_lebs: minimum number of LEBs required for the index
  * @old_idx_sz: size of index on flash
  * @calc_idx_sz: temporary variable which is used to calculate new index size
  *               (contains accurate new index size at end of TNC commit start)
  * @lst: lprops statistics
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ *           optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ *              pool is full
  *
  * @page_budget: budget for a page
  * @inode_budget: budget for an inode
@@ -1191,8 +1192,6 @@ struct ubifs_info {
 	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
 	unsigned int check_lpt_free:1;
-	unsigned int nospace:1;
-	unsigned int nospace_rp:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
 
@@ -1263,6 +1262,8 @@ struct ubifs_info {
 	unsigned long long old_idx_sz;
 	unsigned long long calc_idx_sz;
 	struct ubifs_lp_stats lst;
+	unsigned int nospace:1;
+	unsigned int nospace_rp:1;
 
 	int page_budget;
 	int inode_budget;
-- 
cgit v1.2.3


From be61678b1d97c9b47bb839beb3c3f48da36af072 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 8 Sep 2008 18:08:39 +0300
Subject: UBIFS: fix commentary

Znode may refer both data nodes and indexing nodes

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ubifs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c3ac5a8221f..49b06c9f675 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -707,8 +707,8 @@ struct ubifs_jhead {
  * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
  * @key: key
  * @znode: znode address in memory
- * @lnum: LEB number of the indexing node
- * @offs: offset of the indexing node within @lnum
+ * @lnum: LEB number of the target node (indexing node or data node)
+ * @offs: target node offset within @lnum
  * @len: target node length
  */
 struct ubifs_zbranch {
-- 
cgit v1.2.3


From b5e426e9a4a8d4ccc65a6849420d47f87c080d5d Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 9 Sep 2008 11:20:35 +0300
Subject: UBIFS: update dbg_dump_inode

'dbg_dump_inode()' is quite outdated and does not print all
the fileds.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d7f7645779f..32071ec87cd 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
 {
 	const struct ubifs_inode *ui = ubifs_inode(inode);
 
-	printk(KERN_DEBUG "inode      %lu\n", inode->i_ino);
-	printk(KERN_DEBUG "size       %llu\n",
+	printk(KERN_DEBUG "Dump in-memory inode:");
+	printk(KERN_DEBUG "\tinode          %lu\n", inode->i_ino);
+	printk(KERN_DEBUG "\tsize           %llu\n",
 	       (unsigned long long)i_size_read(inode));
-	printk(KERN_DEBUG "nlink      %u\n", inode->i_nlink);
-	printk(KERN_DEBUG "uid        %u\n", (unsigned int)inode->i_uid);
-	printk(KERN_DEBUG "gid        %u\n", (unsigned int)inode->i_gid);
-	printk(KERN_DEBUG "atime      %u.%u\n",
+	printk(KERN_DEBUG "\tnlink          %u\n", inode->i_nlink);
+	printk(KERN_DEBUG "\tuid            %u\n", (unsigned int)inode->i_uid);
+	printk(KERN_DEBUG "\tgid            %u\n", (unsigned int)inode->i_gid);
+	printk(KERN_DEBUG "\tatime          %u.%u\n",
 	       (unsigned int)inode->i_atime.tv_sec,
 	       (unsigned int)inode->i_atime.tv_nsec);
-	printk(KERN_DEBUG "mtime      %u.%u\n",
+	printk(KERN_DEBUG "\tmtime          %u.%u\n",
 	       (unsigned int)inode->i_mtime.tv_sec,
 	       (unsigned int)inode->i_mtime.tv_nsec);
-	printk(KERN_DEBUG "ctime       %u.%u\n",
+	printk(KERN_DEBUG "\tctime          %u.%u\n",
 	       (unsigned int)inode->i_ctime.tv_sec,
 	       (unsigned int)inode->i_ctime.tv_nsec);
-	printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
-	printk(KERN_DEBUG "xattr_size  %u\n", ui->xattr_size);
-	printk(KERN_DEBUG "xattr_cnt   %u\n", ui->xattr_cnt);
-	printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
-	printk(KERN_DEBUG "dirty       %u\n", ui->dirty);
-	printk(KERN_DEBUG "xattr       %u\n", ui->xattr);
-	printk(KERN_DEBUG "flags       %d\n", ui->flags);
-	printk(KERN_DEBUG "compr_type  %d\n", ui->compr_type);
-	printk(KERN_DEBUG "data_len    %d\n", ui->data_len);
+	printk(KERN_DEBUG "\tcreat_sqnum    %llu\n", ui->creat_sqnum);
+	printk(KERN_DEBUG "\txattr_size     %u\n", ui->xattr_size);
+	printk(KERN_DEBUG "\txattr_cnt      %u\n", ui->xattr_cnt);
+	printk(KERN_DEBUG "\txattr_names    %u\n", ui->xattr_names);
+	printk(KERN_DEBUG "\tdirty          %u\n", ui->dirty);
+	printk(KERN_DEBUG "\txattr          %u\n", ui->xattr);
+	printk(KERN_DEBUG "\tbulk_read      %u\n", ui->xattr);
+	printk(KERN_DEBUG "\tsynced_i_size  %llu\n",
+	       (unsigned long long)ui->synced_i_size);
+	printk(KERN_DEBUG "\tui_size        %llu\n",
+	       (unsigned long long)ui->ui_size);
+	printk(KERN_DEBUG "\tflags          %d\n", ui->flags);
+	printk(KERN_DEBUG "\tcompr_type     %d\n", ui->compr_type);
+	printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
+	printk(KERN_DEBUG "\tread_in_a_row  %lu\n", ui->read_in_a_row);
+	printk(KERN_DEBUG "\tdata_len       %d\n", ui->data_len);
 }
 
 void dbg_dump_node(const struct ubifs_info *c, const void *node)
-- 
cgit v1.2.3


From af2eb5637b88f7b8edf295ad3880706c5c30c324 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 9 Sep 2008 12:23:50 +0300
Subject: UBIFS: correct comment for commit_on_unmount

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index cf078b5cc88..dae1c62156c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1465,12 +1465,9 @@ out:
  * commit_on_unmount - commit the journal when un-mounting.
  * @c: UBIFS file-system description object
  *
- * This function is called during un-mounting and it commits the journal unless
- * the "fast unmount" mode is enabled. It also avoids committing the journal if
- * it contains too few data.
- *
- * Sometimes recovery requires the journal to be committed at least once, and
- * this function takes care about this.
+ * This function is called during un-mounting and re-mounting, and it commits
+ * the journal unless the "fast unmount" mode is enabled. It also avoids
+ * committing the journal if it contains too few data.
  */
 static void commit_on_unmount(struct ubifs_info *c)
 {
-- 
cgit v1.2.3


From 403e12ab30ab160e1015bd998f0abc1865c574e0 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 9 Sep 2008 12:31:37 +0300
Subject: UBIFS: commit on sync_fs

Commit the journal when the FS is sync'ed. This will make
statfs provide better free space report. And we anyway
advice our users to sync the FS if they want better statfs
report.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index dae1c62156c..7e1f3efdf63 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -418,6 +418,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
 	struct ubifs_info *c = sb->s_fs_info;
 	int i, ret = 0, err;
+	long long bud_bytes;
 
 	if (c->jheads)
 		for (i = 0; i < c->jhead_cnt; i++) {
@@ -425,6 +426,17 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 			if (err && !ret)
 				ret = err;
 		}
+
+	/* Commit the journal unless it has too few data */
+	spin_lock(&c->buds_lock);
+	bud_bytes = c->bud_bytes;
+	spin_unlock(&c->buds_lock);
+	if (bud_bytes > c->leb_size) {
+		err = ubifs_run_commit(c);
+		if (err)
+			return err;
+	}
+
 	/*
 	 * We ought to call sync for c->ubi but it does not have one. If it had
 	 * it would in turn call mtd->sync, however mtd operations are
-- 
cgit v1.2.3


From bed79935de9a658678f44b88a097367d3b26429f Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 11 Sep 2008 14:25:44 +0300
Subject: UBIFS: allow for sync_fs when read-only

sync_fs can be called even if the file system is mounted
read-only.  Ensure the commit is not run in that case.

Reported-by: Zoltan Sogor <weth@inf.u-szeged.hu>
Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/super.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7e1f3efdf63..7fd759dde79 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -420,21 +420,22 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, ret = 0, err;
 	long long bud_bytes;
 
-	if (c->jheads)
+	if (c->jheads) {
 		for (i = 0; i < c->jhead_cnt; i++) {
 			err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
 			if (err && !ret)
 				ret = err;
 		}
 
-	/* Commit the journal unless it has too few data */
-	spin_lock(&c->buds_lock);
-	bud_bytes = c->bud_bytes;
-	spin_unlock(&c->buds_lock);
-	if (bud_bytes > c->leb_size) {
-		err = ubifs_run_commit(c);
-		if (err)
-			return err;
+		/* Commit the journal unless it has too little data */
+		spin_lock(&c->buds_lock);
+		bud_bytes = c->bud_bytes;
+		spin_unlock(&c->buds_lock);
+		if (bud_bytes > c->leb_size) {
+			err = ubifs_run_commit(c);
+			if (err)
+				return err;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From 46773be497a05010a2873e9ad96d739fb352c1e4 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 11 Sep 2008 12:57:49 +0300
Subject: UBIFS: improve garbage collection

Make garbage collection try to keep data nodes from the same
inode together and in ascending order.  This improves
performance when reading those nodes especially when bulk-read
is used.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/gc.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 72 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a6b633a5629..0bef6501d58 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -95,6 +95,48 @@ static int switch_gc_head(struct ubifs_info *c)
 	return err;
 }
 
+/**
+ * joinup - bring data nodes for an inode together.
+ * @c: UBIFS file-system description object
+ * @sleb: describes scanned LEB
+ * @inum: inode number
+ * @blk: block number
+ * @data: list to which to add data nodes
+ *
+ * This function looks at the first few nodes in the scanned LEB @sleb and adds
+ * them to @data if they are data nodes from @inum and have a larger block
+ * number than @blk. This function returns %0 on success and a negative error
+ * code on failure.
+ */
+static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
+		  unsigned int blk, struct list_head *data)
+{
+	int err, cnt = 6, lnum = sleb->lnum, offs;
+	struct ubifs_scan_node *snod, *tmp;
+	union ubifs_key *key;
+
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		key = &snod->key;
+		if (key_inum(c, key) == inum &&
+		    key_type(c, key) == UBIFS_DATA_KEY &&
+		    key_block(c, key) > blk) {
+			offs = snod->offs;
+			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
+			if (err < 0)
+				return err;
+			list_del(&snod->list);
+			if (err) {
+				list_add_tail(&snod->list, data);
+				blk = key_block(c, key);
+			} else
+				kfree(snod);
+			cnt = 6;
+		} else if (--cnt == 0)
+			break;
+	}
+	return 0;
+}
+
 /**
  * move_nodes - move nodes.
  * @c: UBIFS file-system description object
@@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c)
 static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head large, medium, small;
+	struct list_head data, large, medium, small;
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 	int avail, err, min = INT_MAX;
+	unsigned int blk = 0;
+	ino_t inum = 0;
 
+	INIT_LIST_HEAD(&data);
 	INIT_LIST_HEAD(&large);
 	INIT_LIST_HEAD(&medium);
 	INIT_LIST_HEAD(&small);
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		struct list_head *lst;
+	while (!list_empty(&sleb->nodes)) {
+		struct list_head *lst = sleb->nodes.next;
+
+		snod = list_entry(lst, struct ubifs_scan_node, list);
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		if (err < 0)
 			goto out;
 
-		lst = &snod->list;
 		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
@@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		}
 
 		/*
-		 * Sort the list of nodes so that large nodes go first, and
-		 * small nodes go last.
+		 * Sort the list of nodes so that data nodes go first, large
+		 * nodes go second, and small nodes go last.
 		 */
-		if (snod->len > MEDIUM_NODE_WM)
-			list_add(lst, &large);
+		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
+			if (inum != key_inum(c, &snod->key)) {
+				if (inum) {
+					/*
+					 * Try to move data nodes from the same
+					 * inode together.
+					 */
+					err = joinup(c, sleb, inum, blk, &data);
+					if (err)
+						goto out;
+				}
+				inum = key_inum(c, &snod->key);
+				blk = key_block(c, &snod->key);
+			}
+			list_add_tail(lst, &data);
+		} else if (snod->len > MEDIUM_NODE_WM)
+			list_add_tail(lst, &large);
 		else if (snod->len > SMALL_NODE_WM)
-			list_add(lst, &medium);
+			list_add_tail(lst, &medium);
 		else
-			list_add(lst, &small);
+			list_add_tail(lst, &small);
 
 		/* And find the smallest node */
 		if (snod->len < min)
@@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 	 * Join the tree lists so that we'd have one roughly sorted list
 	 * ('large' will be the head of the joined list).
 	 */
+	list_splice(&data, &large);
 	list_splice(&medium, large.prev);
 	list_splice(&small, large.prev);
 
-- 
cgit v1.2.3


From 5c0013c16bd2ee08ffef1a1365622556a57218f5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 12 Sep 2008 10:34:51 +0300
Subject: UBIFS: fix bulk-read handling uptodate pages

Bulk-read skips uptodate pages but this was putting its
array index out and causing it to treat subsequent pages
as holes.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/file.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2f20a49ba34..51cf511d44d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -595,7 +595,7 @@ out:
 static int populate_page(struct ubifs_info *c, struct page *page,
 			 struct bu_info *bu, int *n)
 {
-	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 1, read = 0;
+	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
 	struct inode *inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
 	unsigned int page_block;
@@ -609,6 +609,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
 	if (!i_size || page->index > end_index) {
+		hole = 1;
 		memset(addr, 0, PAGE_CACHE_SIZE);
 		goto out_hole;
 	}
@@ -617,10 +618,10 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 	while (1) {
 		int err, len, out_len, dlen;
 
-		if (nn >= bu->cnt ||
-		    key_block(c, &bu->zbranch[nn].key) != page_block)
+		if (nn >= bu->cnt) {
+			hole = 1;
 			memset(addr, 0, UBIFS_BLOCK_SIZE);
-		else {
+		} else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
 			struct ubifs_data_node *dn;
 
 			dn = bu->buf + (bu->zbranch[nn].offs - offs);
@@ -643,8 +644,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 				memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
 
 			nn += 1;
-			hole = 0;
 			read = (i << UBIFS_BLOCK_SHIFT) + len;
+		} else if (key_block(c, &bu->zbranch[nn].key) < page_block) {
+			nn += 1;
+			continue;
+		} else {
+			hole = 1;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
 		}
 		if (++i >= UBIFS_BLOCKS_PER_PAGE)
 			break;
-- 
cgit v1.2.3


From 73944a6de048c2c49422e9063e57198256efd23e Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 12 Sep 2008 18:13:31 +0300
Subject: UBIFS: add more debugging messages for LPT

Also add debugging checks for LPT size and separate
out c->check_lpt_free from unrelated bitfields.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/debug.c      |  37 ++++++++++
 fs/ubifs/debug.h      |   6 ++
 fs/ubifs/lpt.c        |   3 +-
 fs/ubifs/lpt_commit.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ubifs/ubifs.h      |  10 ++-
 5 files changed, 228 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 32071ec87cd..7186400750e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -655,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c)
 	}
 }
 
+void dbg_dump_lpt_info(struct ubifs_info *c)
+{
+	int i;
+
+	spin_lock(&dbg_lock);
+	printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz);
+	printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz);
+	printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz);
+	printk(KERN_DEBUG "\tltab_sz:       %d\n", c->ltab_sz);
+	printk(KERN_DEBUG "\tlsave_sz:      %d\n", c->lsave_sz);
+	printk(KERN_DEBUG "\tbig_lpt:       %d\n", c->big_lpt);
+	printk(KERN_DEBUG "\tlpt_hght:      %d\n", c->lpt_hght);
+	printk(KERN_DEBUG "\tpnode_cnt:     %d\n", c->pnode_cnt);
+	printk(KERN_DEBUG "\tnnode_cnt:     %d\n", c->nnode_cnt);
+	printk(KERN_DEBUG "\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt);
+	printk(KERN_DEBUG "\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt);
+	printk(KERN_DEBUG "\tlsave_cnt:     %d\n", c->lsave_cnt);
+	printk(KERN_DEBUG "\tspace_bits:    %d\n", c->space_bits);
+	printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+	printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+	printk(KERN_DEBUG "\tlpt_spc_bits:  %d\n", c->lpt_spc_bits);
+	printk(KERN_DEBUG "\tpcnt_bits:     %d\n", c->pcnt_bits);
+	printk(KERN_DEBUG "\tlnum_bits:     %d\n", c->lnum_bits);
+	printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+	printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
+	       c->nhead_lnum, c->nhead_offs);
+	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
+		       c->lsave_lnum, c->lsave_offs);
+	for (i = 0; i < c->lpt_lebs; i++)
+		printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d "
+		       "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
+		       c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
+	spin_unlock(&dbg_lock);
+}
+
 void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 {
 	struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 50315fc5718..33d6b95071e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
 void dbg_dump_budg(struct ubifs_info *c);
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
 void dbg_dump_lprops(struct ubifs_info *c);
+void dbg_dump_lpt_info(struct ubifs_info *c);
 void dbg_dump_leb(const struct ubifs_info *c, int lnum);
 void dbg_dump_znode(const struct ubifs_info *c,
 		    const struct ubifs_znode *znode);
@@ -249,6 +250,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_cats(struct ubifs_info *c);
 int dbg_check_ltab(struct ubifs_info *c);
+int dbg_chk_lpt_free_spc(struct ubifs_info *c);
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len);
 int dbg_check_synced_i_size(struct inode *inode);
 int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
 int dbg_check_tnc(struct ubifs_info *c, int extra);
@@ -367,6 +370,7 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define dbg_dump_budg(c)                      ({})
 #define dbg_dump_lprop(c, lp)                 ({})
 #define dbg_dump_lprops(c)                    ({})
+#define dbg_dump_lpt_info(c)                  ({})
 #define dbg_dump_leb(c, lnum)                 ({})
 #define dbg_dump_znode(c, znode)              ({})
 #define dbg_dump_heap(c, heap, cat)           ({})
@@ -379,6 +383,8 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
+#define dbg_chk_lpt_free_spc(c)                    0
+#define dbg_chk_lpt_sz(c, action, len)             0
 #define dbg_check_synced_i_size(inode)             0
 #define dbg_check_dir_size(c, dir)                 0
 #define dbg_check_tnc(c, x)                        0
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 9ff2463177e..cd11b23a187 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
 	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
 	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
 	c->lpt_sz += c->ltab_sz;
-	c->lpt_sz += c->lsave_sz;
+	if (c->big_lpt)
+		c->lpt_sz += c->lsave_sz;
 
 	/* Add wastage */
 	sz = c->lpt_sz;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5f0b83e20af..8546865a910 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
 			return 0;
 		}
 	}
-	dbg_err("last LEB %d", *lnum);
-	dump_stack();
 	return -ENOSPC;
 }
 
@@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c)
 	int lnum, offs, len, alen, done_lsave, done_ltab, err;
 	struct ubifs_cnode *cnode;
 
+	err = dbg_chk_lpt_sz(c, 0, 0);
+	if (err)
+		return err;
 	cnode = c->lpt_cnext;
 	if (!cnode)
 		return 0;
@@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->lsave_lnum = lnum;
 		c->lsave_offs = offs;
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	if (offs + c->ltab_sz <= c->leb_size) {
@@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->ltab_lnum = lnum;
 		c->ltab_offs = offs;
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	do {
@@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		while (offs + len > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c)
 				c->lsave_lnum = lnum;
 				c->lsave_offs = offs;
 				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 				continue;
 			}
 			if (!done_ltab) {
@@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info *c)
 				c->ltab_lnum = lnum;
 				c->ltab_offs = offs;
 				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 				continue;
 			}
 			break;
@@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c)
 			c->lpt_offs = offs;
 		}
 		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
 		cnode = cnode->cnext;
 	} while (cnode && cnode != c->lpt_cnext);
 
@@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->lsave_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->lsave_lnum = lnum;
 		c->lsave_offs = offs;
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	/* Make sure to place LPT's own lprops table */
@@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->ltab_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->ltab_lnum = lnum;
 		c->ltab_offs = offs;
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	alen = ALIGN(offs, c->min_io_size);
 	upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+	dbg_chk_lpt_sz(c, 4, alen - offs);
+	err = dbg_chk_lpt_sz(c, 3, alen);
+	if (err)
+		return err;
 	return 0;
+
+no_space:
+	ubifs_err("LPT out of space");
+	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
+		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	return err;
 }
 
 /**
@@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
 			*lnum = i + c->lpt_first;
 			return 0;
 		}
-	dbg_err("last LEB %d", *lnum);
-	dump_stack();
 	return -ENOSPC;
 }
 
@@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c)
 		done_lsave = 1;
 		ubifs_pack_lsave(c, buf + offs, c->lsave);
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	if (offs + c->ltab_sz <= c->leb_size) {
 		done_ltab = 1;
 		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	/* Loop for each cnode */
@@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c)
 						       alen, UBI_SHORTTERM);
 				if (err)
 					return err;
+				dbg_chk_lpt_sz(c, 4, alen - wlen);
 			}
+			dbg_chk_lpt_sz(c, 2, 0);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
@@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c)
 				done_lsave = 1;
 				ubifs_pack_lsave(c, buf + offs, c->lsave);
 				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 				continue;
 			}
 			if (!done_ltab) {
 				done_ltab = 1;
 				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 				continue;
 			}
 			break;
@@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c)
 		clear_bit(COW_ZNODE, &cnode->flags);
 		smp_mb__after_clear_bit();
 		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
 		cnode = cnode->cnext;
 	} while (cnode && cnode != c->lpt_cnext);
 
@@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
+			dbg_chk_lpt_sz(c, 2, alen - wlen);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c)
 		done_lsave = 1;
 		ubifs_pack_lsave(c, buf + offs, c->lsave);
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	/* Make sure to place LPT's own lprops table */
@@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
+			dbg_chk_lpt_sz(c, 2, alen - wlen);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c)
 		done_ltab = 1;
 		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	/* Write remaining data in buffer */
@@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c)
 	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
 	if (err)
 		return err;
+
+	dbg_chk_lpt_sz(c, 4, alen - wlen);
+	err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size));
+	if (err)
+		return err;
+
 	c->nhead_lnum = lnum;
 	c->nhead_offs = ALIGN(offs, c->min_io_size);
 
@@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c)
 	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
 	if (c->big_lpt)
 		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
 	return 0;
+
+no_space:
+	ubifs_err("LPT out of space mismatch");
+	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
+	        "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	return err;
 }
 
 /**
@@ -1156,6 +1201,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c)
 	dbg_lp("");
 
 	mutex_lock(&c->lp_mutex);
+	err = dbg_chk_lpt_free_spc(c);
+	if (err)
+		goto out;
 	err = dbg_check_ltab(c);
 	if (err)
 		goto out;
@@ -1645,4 +1693,121 @@ int dbg_check_ltab(struct ubifs_info *c)
 	return 0;
 }
 
+/**
+ * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_free_spc(struct ubifs_info *c)
+{
+	long long free = 0;
+	int i;
+
+	for (i = 0; i < c->lpt_lebs; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (i + c->lpt_first == c->nhead_lnum)
+			free += c->leb_size - c->nhead_offs;
+		else if (c->ltab[i].free == c->leb_size)
+			free += c->leb_size;
+	}
+	if (free < c->lpt_sz) {
+		dbg_err("LPT space error: free %lld lpt_sz %lld",
+			free, c->lpt_sz);
+		dbg_dump_lpt_info(c);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
+ * @c: the UBIFS file-system description object
+ * @action: action
+ * @len: length written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
+{
+	long long chk_lpt_sz, lpt_sz;
+	int err = 0;
+
+	switch (action) {
+	case 0:
+		c->chk_lpt_sz = 0;
+		c->chk_lpt_sz2 = 0;
+		c->chk_lpt_lebs = 0;
+		c->chk_lpt_wastage = 0;
+		if (c->dirty_pn_cnt > c->pnode_cnt) {
+			dbg_err("dirty pnodes %d exceed max %d",
+				c->dirty_pn_cnt, c->pnode_cnt);
+			err = -EINVAL;
+		}
+		if (c->dirty_nn_cnt > c->nnode_cnt) {
+			dbg_err("dirty nnodes %d exceed max %d",
+				c->dirty_nn_cnt, c->nnode_cnt);
+			err = -EINVAL;
+		}
+		return err;
+	case 1:
+		c->chk_lpt_sz += len;
+		return 0;
+	case 2:
+		c->chk_lpt_sz += len;
+		c->chk_lpt_wastage += len;
+		c->chk_lpt_lebs += 1;
+		return 0;
+	case 3:
+		chk_lpt_sz = c->leb_size;
+		chk_lpt_sz *= c->chk_lpt_lebs;
+		chk_lpt_sz += len - c->nhead_offs;
+		if (c->chk_lpt_sz != chk_lpt_sz) {
+			dbg_err("LPT wrote %lld but space used was %lld",
+				c->chk_lpt_sz, chk_lpt_sz);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz > c->lpt_sz) {
+			dbg_err("LPT wrote %lld but lpt_sz is %lld",
+				c->chk_lpt_sz, c->lpt_sz);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+			dbg_err("LPT layout size %lld but wrote %lld",
+				c->chk_lpt_sz, c->chk_lpt_sz2);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+			dbg_err("LPT new nhead offs: expected %d was %d",
+				c->new_nhead_offs, len);
+			err = -EINVAL;
+		}
+		lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+		lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+		lpt_sz += c->ltab_sz;
+		if (c->big_lpt)
+			lpt_sz += c->lsave_sz;
+		if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+			dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+				c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+			err = -EINVAL;
+		}
+		if (err)
+			dbg_dump_lpt_info(c);
+		c->chk_lpt_sz2 = c->chk_lpt_sz;
+		c->chk_lpt_sz = 0;
+		c->chk_lpt_wastage = 0;
+		c->chk_lpt_lebs = 0;
+		c->new_nhead_offs = len;
+		return err;
+	case 4:
+		c->chk_lpt_sz += len;
+		c->chk_lpt_wastage += len;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 49b06c9f675..a7bd32fa15b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -943,7 +943,6 @@ struct ubifs_mount_opts {
  *
  * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
- * @check_lpt_free: flag that indicates LPT GC may be needed
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
  * @bulk_read: enable bulk-reads
@@ -1102,6 +1101,7 @@ struct ubifs_mount_opts {
  * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
  * @dirty_nn_cnt: number of dirty nnodes
  * @dirty_pn_cnt: number of dirty pnodes
+ * @check_lpt_free: flag that indicates LPT GC may be needed
  * @lpt_sz: LPT size
  * @lpt_nod_buf: buffer for an on-flash nnode or pnode
  * @lpt_buf: buffer of LEB size used by LPT
@@ -1191,7 +1191,6 @@ struct ubifs_info {
 
 	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
-	unsigned int check_lpt_free:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
 
@@ -1340,6 +1339,7 @@ struct ubifs_info {
 	int lpt_drty_flgs;
 	int dirty_nn_cnt;
 	int dirty_pn_cnt;
+	int check_lpt_free;
 	long long lpt_sz;
 	void *lpt_nod_buf;
 	void *lpt_buf;
@@ -1394,6 +1394,12 @@ struct ubifs_info {
 	unsigned long fail_timeout;
 	unsigned int fail_cnt;
 	unsigned int fail_cnt_max;
+	long long chk_lpt_sz;
+	long long chk_lpt_sz2;
+	long long chk_lpt_wastage;
+	int chk_lpt_lebs;
+	int new_nhead_lnum;
+	int new_nhead_offs;
 #endif
 };
 
-- 
cgit v1.2.3


From 63c300b68fd93a9fadc5e317d4d001b7a6985486 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Wed, 17 Sep 2008 12:11:13 +0300
Subject: UBIFS: correct condition to eliminate unecessary assignment

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/tnc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e0878a431b9..d27fd918b9c 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1600,7 +1600,7 @@ out:
 	 * An enormous hole could cause bulk-read to encompass too many
 	 * page cache pages, so limit the number here.
 	 */
-	if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+	if (bu->blk_cnt > UBIFS_MAX_BULK_READ)
 		bu->blk_cnt = UBIFS_MAX_BULK_READ;
 	/*
 	 * Ensure that bulk-read covers a whole number of page cache
-- 
cgit v1.2.3


From be2f6bd62d0d4246a9227dacbe2469e1f0eccf26 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 26 Sep 2008 12:52:21 +0300
Subject: UBIFS: check buffer length when scanning for LPT nodes

'is_a_node()' function was reading from a buffer before
checking the buffer length, resulting in an OOPS as
follows:

BUG: unable to handle kernel paging request at f8f74002
IP: [<f8f9783f>] :ubifs:ubifs_unpack_bits+0xca/0x233
*pde = 19e95067 *pte = 00000000
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: ubifs ubi mtdchar bio2mtd mtd brd video output
[last unloaded: mtd]

Pid: 6414, comm: integck Not tainted (2.6.27-rc6ubifs34 #23)
EIP: 0060:[<f8f9783f>] EFLAGS: 00010246 CPU: 0
EIP is at ubifs_unpack_bits+0xca/0x233 [ubifs]
EAX: 00000000 EBX: f6090630 ECX: d9badcfc EDX: 00000000
ESI: 00000004 EDI: f8f74002 EBP: d9badcec ESP: d9badcc0
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process integck (pid: 6414, ti=d9bac000 task=f727dae0 task.ti=d9bac000)
Stack: 00000006 f7306240 00000002 00000000 d9badcfc d9badd00 0000001c 00000000
       f6090630 f6090630 f8f74000 d9badd10 f8fa1cc9 00000000 f8f74002 00000000
       f8f74002 f60fe128 f6090630 f8f74000 d9badd68 f8fa1e46 00000000 0001e000
Call Trace:
 [<f8fa1cc9>] ? is_a_node+0x30/0x90 [ubifs]
 [<f8fa1e46>] ? dbg_check_ltab+0x11d/0x5bd [ubifs]
 [<f8fa388f>] ? ubifs_lpt_start_commit+0x42/0xed3 [ubifs]
 [<c038e76a>] ? mutex_unlock+0x8/0xa
 [<f8f9625d>] ? ubifs_tnc_start_commit+0x1c8/0xedb [ubifs]
 [<f8f8d90b>] ? do_commit+0x187/0x523 [ubifs]
 [<c038e76a>] ? mutex_unlock+0x8/0xa
 [<f8f7ca17>] ? bud_wbuf_callback+0x22/0x28 [ubifs]
 [<f8f8dd1d>] ? ubifs_run_commit+0x76/0xc0 [ubifs]
 [<f8f8032c>] ? ubifs_sync_fs+0xd2/0xe6 [ubifs]
 [<c01a2e97>] ? vfs_quota_sync+0x0/0x17e
 [<c01a5ba6>] ? quota_sync_sb+0x26/0xbb
 [<c01a2e97>] ? vfs_quota_sync+0x0/0x17e
 [<c01a5c5d>] ? sync_dquots+0x22/0x12c
 [<c0173d1b>] ? __fsync_super+0x19/0x68
 [<c0173d75>] ? fsync_super+0xb/0x19
 [<c0174065>] ? generic_shutdown_super+0x22/0xe7
 [<c01a31fc>] ? vfs_quota_off+0x0/0x5fd
 [<f8f7cf4d>] ? ubifs_kill_sb+0x31/0x35 [ubifs]
 [<c01741f9>] ? deactivate_super+0x5e/0x71
 [<c0187610>] ? mntput_no_expire+0x82/0xe4
 [<c0187905>] ? sys_umount+0x4c/0x2f6
 [<c0187bc8>] ? sys_oldumount+0x19/0x1b
 [<c0103b71>] ? sysenter_do_call+0x12/0x25
 =======================
Code: c1 f8 03 8d 04 07 8b 4d e8 89 01 8b 45 e4 89 10 89 d8 89 f1 d3 e8 85 c0
      74 07 29 d6 83 fe 20 75 2a 89 d8 83 c4 20 5b 5e 5f 5d
EIP: [<f8f9783f>] ubifs_unpack_bits+0xca/0x233 [ubifs] SS:ESP 0068:d9badcc0
---[ end trace 1f02572436518c13 ]---

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/lpt_commit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8546865a910..eed5a0025d6 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1089,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
 	int pos = 0, node_type, node_len;
 	uint16_t crc, calc_crc;
 
+	if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8)
+		return 0;
 	node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
 	if (node_type == UBIFS_LPT_NOT_A_NODE)
 		return 0;
-- 
cgit v1.2.3


From 769415c61191bc860f60c6edc3cb7cba24fb3218 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 16 Oct 2008 16:08:56 +0200
Subject: fuse: fix SEEK_END incorrectness

Update file size before using it in lseek(..., SEEK_END).

Reported-by: Amnon Shiloh <u3557@miso.sublimeip.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2bada6bbc31..98079aa800e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1448,6 +1448,9 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	mutex_lock(&inode->i_mutex);
 	switch (origin) {
 	case SEEK_END:
+		retval = fuse_update_attributes(inode, NULL, file, NULL);
+		if (retval)
+			return retval;
 		offset += i_size_read(inode);
 		break;
 	case SEEK_CUR:
-- 
cgit v1.2.3


From 17e18ab6ff6ec44e95514c7346d2cbd0363ef640 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 16 Oct 2008 16:08:56 +0200
Subject: fuse: add missing fuse_request_free

The error handling code for the second call to fuse_request_alloc should
include freeing the result of the first one.

This bug was found by the Coccinelle project:

  http://www.emn.fr/x-info/coccinelle/

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a84388cacf..54b1f0e1ef5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -865,7 +865,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (is_bdev) {
 		fc->destroy_req = fuse_request_alloc();
 		if (!fc->destroy_req)
-			goto err_put_root;
+			goto err_free_init_req;
 	}
 
 	mutex_lock(&fuse_mutex);
@@ -895,6 +895,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
  err_unlock:
 	mutex_unlock(&fuse_mutex);
+ err_free_init_req:
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
-- 
cgit v1.2.3


From 37194d0723b9b68b4b299b2564ca99e3d0a094c3 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: config description improvement

Make the short description of the FUSE_FS config option clearer.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9e9d70c02a0..9c43045ebbf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -665,7 +665,7 @@ config AUTOFS4_FS
 	  N here.
 
 config FUSE_FS
-	tristate "Filesystem in Userspace support"
+	tristate "FUSE (Filesystem in Userspace) support"
 	help
 	  With FUSE it is possible to implement a fully functional filesystem
 	  in a userspace program.
-- 
cgit v1.2.3


From 29d434b39c807320fbe4bcdce0ab98a0b9fcb285 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: add include protectors

Add include protectors to include/linux/fuse.h and fs/fuse/fuse_i.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/fuse_i.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3a876076bdd..35accfdd747 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -6,6 +6,9 @@
   See the file COPYING.
 */
 
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
 #include <linux/fuse.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -655,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode);
 void fuse_release_nowrite(struct inode *inode);
 
 u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
-- 
cgit v1.2.3


From a7c1b990f71574e077b94ce4582e2cf11cb891fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: implement nonseekable open

Let the client request nonseekable open using FOPEN_NONSEEKABLE and
call nonseekable_open() on the file if requested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 98079aa800e..34930a964b8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
 		invalidate_inode_pages2(inode->i_mapping);
+	if (outarg->open_flags & FOPEN_NONSEEKABLE)
+		nonseekable_open(inode, file);
 	ff->fh = outarg->fh;
 	file->private_data = fuse_file_get(ff);
 }
-- 
cgit v1.2.3


From da27c118eb4f87f27ae8da2635b916daedf7264f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:56 -0700
Subject: fs/proc: use nr_irqs

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index b675a49c182..a2173a2a562 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -509,7 +509,7 @@ static int show_stat(struct seq_file *p, void *v)
 	struct timespec boottime;
 	unsigned int *per_irq_sum;
 
-	per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
+	per_irq_sum = kzalloc(sizeof(unsigned int)*nr_irqs, GFP_KERNEL);
 	if (!per_irq_sum)
 		return -ENOMEM;
 
@@ -531,7 +531,7 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		for (j = 0; j < NR_IRQS; j++) {
+		for (j = 0; j < nr_irqs; j++) {
 			unsigned int temp = kstat_cpu(i).irqs[j];
 			sum += temp;
 			per_irq_sum[j] += temp;
@@ -577,7 +577,7 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
-	for (i = 0; i < NR_IRQS; i++)
+	for (i = 0; i < nr_irqs; i++)
 		seq_printf(p, " %u", per_irq_sum[i]);
 
 	seq_printf(p,
@@ -631,13 +631,13 @@ static const struct file_operations proc_stat_operations = {
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
-	return (*pos <= NR_IRQS) ? pos : NULL;
+	return (*pos <= nr_irqs) ? pos : NULL;
 }
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
 	(*pos)++;
-	if (*pos > NR_IRQS)
+	if (*pos > nr_irqs)
 		return NULL;
 	return pos;
 }
-- 
cgit v1.2.3


From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:09 -0700
Subject: x86: move kstat_irqs from kstat to irq_desc

based on Eric's patch ...

together mold it with dyn_array for irq_desc, will allcate kstat_irqs for
nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already.

v2: make sure system without generic_hardirqs works they don't have irq_desc
v3: fix merging
v4: [mingo@elte.hu] fix typo

[ mingo@elte.hu ] irq: build fix

fix:

 arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow':
 arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs'

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a2173a2a562..aa069acf61a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -532,7 +532,7 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		for (j = 0; j < nr_irqs; j++) {
-			unsigned int temp = kstat_cpu(i).irqs[j];
+			unsigned int temp = kstat_irqs_cpu(j, i);
 			sum += temp;
 			per_irq_sum[j] += temp;
 		}
-- 
cgit v1.2.3


From c7fb03a475bd80c642c1345d85c7c550f63514b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:12 -0700
Subject: irq, fs/proc: replace loop with nr_irqs for proc/stat

Replace another nr_irqs loop to avoid the allocation of all sparse
irq entries - use for_each_irq_desc instead.

v2: make sure arch without GENERIC_HARDIRQS works too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index aa069acf61a..c3cbabe8b38 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -30,6 +30,7 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
+#include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -501,17 +502,16 @@ static const struct file_operations proc_vmalloc_operations = {
 
 static int show_stat(struct seq_file *p, void *v)
 {
-	int i;
+	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
 	struct timespec boottime;
-	unsigned int *per_irq_sum;
-
-	per_irq_sum = kzalloc(sizeof(unsigned int)*nr_irqs, GFP_KERNEL);
-	if (!per_irq_sum)
-		return -ENOMEM;
+	unsigned int per_irq_sum;
+#ifdef CONFIG_GENERIC_HARDIRQS
+	struct irq_desc *desc;
+#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -520,8 +520,6 @@ static int show_stat(struct seq_file *p, void *v)
 	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
-		int j;
-
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
@@ -531,10 +529,12 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		for (j = 0; j < nr_irqs; j++) {
-			unsigned int temp = kstat_irqs_cpu(j, i);
+		for_each_irq_desc(j, desc)
+		{
+			unsigned int temp;
+
+			temp = kstat_irqs_cpu(j, i);
 			sum += temp;
-			per_irq_sum[j] += temp;
 		}
 		sum += arch_irq_stat_cpu(i);
 	}
@@ -577,8 +577,23 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
-	for (i = 0; i < nr_irqs; i++)
-		seq_printf(p, " %u", per_irq_sum[i]);
+	/* sum again ? it could be updated? */
+	for_each_irq_desc(j, desc)
+	{
+		per_irq_sum = 0;
+		for_each_possible_cpu(i) {
+			unsigned int temp;
+
+			temp = kstat_irqs_cpu(j, i);
+			per_irq_sum += temp;
+		}
+
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+		seq_printf(p, " %u:%u", j, per_irq_sum);
+#else
+		seq_printf(p, " %u", per_irq_sum);
+#endif
+	}
 
 	seq_printf(p,
 		"\nctxt %llu\n"
@@ -592,7 +607,6 @@ static int show_stat(struct seq_file *p, void *v)
 		nr_running(),
 		nr_iowait());
 
-	kfree(per_irq_sum);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 52b17329d6d0a4824b89206803a430915031ff23 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:20 -0700
Subject: x86_64: make /proc/interrupts work with dyn irq_desc

loop with irq_desc list

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index c3cbabe8b38..72dd739a7f8 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -645,15 +645,36 @@ static const struct file_operations proc_stat_operations = {
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	struct irq_desc *desc;
+	int irq;
+	int count = *pos;
+
+	for_each_irq_desc(irq, desc) {
+		if (count-- == 0)
+			return desc;
+	}
+
+	return NULL;
+#else
 	return (*pos <= nr_irqs) ? pos : NULL;
+#endif
 }
 
+
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	struct irq_desc *desc;
+
+	desc = ((struct irq_desc *)v)->next;
 	(*pos)++;
-	if (*pos > nr_irqs)
-		return NULL;
-	return pos;
+
+	return desc;
+#else
+	(*pos)++;
+	return (*pos <= nr_irqs) ? pos : NULL;
+#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
@@ -661,7 +682,6 @@ static void int_seq_stop(struct seq_file *f, void *v)
 	/* Nothing to do */
 }
 
-
 static const struct seq_operations int_seq_ops = {
 	.start = int_seq_start,
 	.next  = int_seq_next,
-- 
cgit v1.2.3


From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:22 -0700
Subject: x86: use 28 bits irq NR for pci msi/msix and ht

also print out irq no in /proc/interrups and /proc/stat in hex, so could
tell bus/dev/func.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 72dd739a7f8..d68c3592fe4 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -589,7 +589,7 @@ static int show_stat(struct seq_file *p, void *v)
 		}
 
 #ifdef CONFIG_HAVE_SPARSE_IRQ
-		seq_printf(p, " %u:%u", j, per_irq_sum);
+		seq_printf(p, " %#x:%u", j, per_irq_sum);
 #else
 		seq_printf(p, " %u", per_irq_sum);
 #endif
-- 
cgit v1.2.3


From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:16:55 +0200
Subject: genirq: remove sparse irq code

This code is not ready, but we need to rip it out instead of rebasing
as we would lose the APIC/IO_APIC unification otherwise.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/proc_misc.c | 43 +++++--------------------------------------
 1 file changed, 5 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index d68c3592fe4..3f5c7b9d1a7 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -529,13 +529,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+
 		for_each_irq_desc(j, desc)
-		{
-			unsigned int temp;
+			sum += kstat_irqs_cpu(j, i);
 
-			temp = kstat_irqs_cpu(j, i);
-			sum += temp;
-		}
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -578,21 +575,13 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_desc(j, desc)
-	{
+	for_each_irq_desc(j, desc) {
 		per_irq_sum = 0;
-		for_each_possible_cpu(i) {
-			unsigned int temp;
 
-			temp = kstat_irqs_cpu(j, i);
-			per_irq_sum += temp;
-		}
+		for_each_possible_cpu(i)
+			per_irq_sum += kstat_irqs_cpu(j, i);
 
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-		seq_printf(p, " %#x:%u", j, per_irq_sum);
-#else
 		seq_printf(p, " %u", per_irq_sum);
-#endif
 	}
 
 	seq_printf(p,
@@ -645,36 +634,14 @@ static const struct file_operations proc_stat_operations = {
  */
 static void *int_seq_start(struct seq_file *f, loff_t *pos)
 {
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc *desc;
-	int irq;
-	int count = *pos;
-
-	for_each_irq_desc(irq, desc) {
-		if (count-- == 0)
-			return desc;
-	}
-
-	return NULL;
-#else
 	return (*pos <= nr_irqs) ? pos : NULL;
-#endif
 }
 
 
 static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
 {
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc *desc;
-
-	desc = ((struct irq_desc *)v)->next;
-	(*pos)++;
-
-	return desc;
-#else
 	(*pos)++;
 	return (*pos <= nr_irqs) ? pos : NULL;
-#endif
 }
 
 static void int_seq_stop(struct seq_file *f, void *v)
-- 
cgit v1.2.3


From 2be3b52a5785a6a5c5349fbd315f57595f7074be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 14:50:27 +0200
Subject: proc: fixup irq iterator

There is no need for irq_desc here. Even for sparse_irq we can
handle this clever in for_each_irq_nr().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/proc_misc.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 3f5c7b9d1a7..97b4579134d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -509,9 +509,6 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
-#ifdef CONFIG_GENERIC_HARDIRQS
-	struct irq_desc *desc;
-#endif
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -530,7 +527,7 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 
-		for_each_irq_desc(j, desc)
+		for_each_irq_nr(j)
 			sum += kstat_irqs_cpu(j, i);
 
 		sum += arch_irq_stat_cpu(i);
@@ -575,7 +572,7 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_desc(j, desc) {
+	for_each_irq_nr(j) {
 		per_irq_sum = 0;
 
 		for_each_possible_cpu(i)
-- 
cgit v1.2.3


From 2c1b861539c15491593625920058e06452cd3747 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 16 Oct 2008 18:35:21 +0000
Subject: [CIFS] Add nodfs mount option Older samba server (eg. 3.0.24 from
 Debian etch) don't work correctly, if DFS paths are used. Such server claim
 that they support DFS, but fail to process some requests with DFS paths.
 Starting with Linux 2.6.26, the cifs clients starts sending DFS paths in such
 situations, rendering it unuseable with older samba servers.

The nodfs mount options forces a share to be used with non DFS paths,
even if the server claims, that it supports it.

Signed-off-by: Martin Koegler <mkoegler@auto.tuwien.ac.at>
Acked-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README    | 3 +++
 fs/cifs/connect.c | 9 +++++++++
 2 files changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index bd2343d4c6a..cbe26fe4012 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -463,6 +463,9 @@ A partial list of the supported mount options follows:
 		with cifs style mandatory byte range locks (and most
 		cifs servers do not yet support requesting advisory
 		byte range locks).
+ nodfs          Disable DFS (global name space support) even if the
+		server claims to support it.  This can help work around
+		a problem with parsing of DFS paths with Samba 3.0.24 server.
  remount        remount the share (often used to change from ro to rw mounts
 	        or vice versa)
  cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4c13bcdb92a..17ca8ce81bb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -90,6 +90,7 @@ struct smb_vol {
 	bool nocase:1;     /* request case insensitive filenames */
 	bool nobrl:1;      /* disable sending byte range locks to srv */
 	bool seal:1;       /* request transport encryption on share */
+	bool nodfs:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -1218,6 +1219,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->sfu_emul = 1;
 		} else if (strnicmp(data, "nosfu", 5) == 0) {
 			vol->sfu_emul = 0;
+		} else if (strnicmp(data, "nodfs", 5) == 0) {
+			vol->nodfs = 1;
 		} else if (strnicmp(data, "posixpaths", 10) == 0) {
 			vol->posix_paths = 1;
 		} else if (strnicmp(data, "noposixpaths", 12) == 0) {
@@ -2197,6 +2200,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						volume_info.UNC,
 						tcon, cifs_sb->local_nls);
 					cFYI(1, ("CIFS Tcon rc = %d", rc));
+					if (volume_info.nodfs) {
+						tcon->Flags &=
+							~SMB_SHARE_IS_IN_DFS;
+						cFYI(1, ("DFS disabled (%d)",
+							tcon->Flags));
+					}
 				}
 				if (!rc) {
 					atomic_inc(&pSesInfo->inUse);
-- 
cgit v1.2.3


From 469ee614aaa367d9cde01cbdd2027212f56c6cc6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 16 Oct 2008 18:46:39 +0000
Subject: [CIFS] eliminate usage of kthread_stop for cifsd

When cifs_demultiplex_thread was converted to a kthread based kernel
thread, great pains were taken to make it so that kthread_stop would be
used to bring it down. This just added unnecessary complexity since we
needed to use a signal anyway to break out of kernel_recvmsg.

Also, cifs_demultiplex_thread does a bit of cleanup as it's exiting, and
we need to be certain that this gets done. It's possible for a kthread
to exit before its main function is ever run if kthread_stop is called
soon after its creation. While I'm not sure that this is a real problem
with cifsd now, it could be at some point in the future if cifs_mount is
ever changed to bring down the thread quickly.

The upshot here is that using kthread_stop to bring down the thread just
adds extra complexity with no real benefit. This patch changes the code
to use the original method to bring down the thread, but still leaves it
so that the thread is actually started with kthread_run.

This seems to fix the deadlock caused by the reproducer in this bug
report:

https://bugzilla.samba.org/show_bug.cgi?id=5720

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 44 +++++++++++++++-----------------------------
 1 file changed, 15 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 17ca8ce81bb..1126f7ab460 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -125,7 +125,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	struct mid_q_entry *mid_entry;
 
 	spin_lock(&GlobalMid_Lock);
-	if (kthread_should_stop()) {
+	if (server->tcpStatus == CifsExiting) {
 		/* the demux thread will exit normally
 		next time through the loop */
 		spin_unlock(&GlobalMid_Lock);
@@ -185,7 +185,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	up(&server->tcpSem);
 
-	while ((!kthread_should_stop()) && (server->tcpStatus != CifsGood)) {
+	while ((server->tcpStatus != CifsExiting) &&
+	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
 		if (server->protocolType == IPV6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
@@ -202,7 +203,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
 			spin_lock(&GlobalMid_Lock);
-			if (!kthread_should_stop())
+			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsGood;
 			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);
@@ -357,7 +358,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 				GFP_KERNEL);
 
 	set_freezable();
-	while (!kthread_should_stop()) {
+	while (server->tcpStatus != CifsExiting) {
 		if (try_to_freeze())
 			continue;
 		if (bigbuf == NULL) {
@@ -398,7 +399,7 @@ incomplete_rcv:
 		    kernel_recvmsg(csocket, &smb_msg,
 				&iov, 1, pdu_length, 0 /* BB other flags? */);
 
-		if (kthread_should_stop()) {
+		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
 			cFYI(1, ("Reconnect after server stopped responding"));
@@ -523,7 +524,7 @@ incomplete_rcv:
 		     total_read += length) {
 			length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 						pdu_length - total_read, 0);
-			if (kthread_should_stop() ||
+			if ((server->tcpStatus == CifsExiting) ||
 			    (length == -EINTR)) {
 				/* then will exit */
 				reconnect = 2;
@@ -652,14 +653,6 @@ multi_t2_fnd:
 	spin_unlock(&GlobalMid_Lock);
 	wake_up_all(&server->response_q);
 
-	/* don't exit until kthread_stop is called */
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_UNINTERRUPTIBLE);
-	}
-	set_current_state(TASK_RUNNING);
-
 	/* check if we have blocked requests that need to free */
 	/* Note that cifs_max_pending is normally 50, but
 	can be set at module install time to as little as two */
@@ -2234,14 +2227,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			spin_lock(&GlobalMid_Lock);
 			srvTcp->tcpStatus = CifsExiting;
 			spin_unlock(&GlobalMid_Lock);
-			if (srvTcp->tsk) {
-				/* If we could verify that kthread_stop would
-				   always wake up processes blocked in
-				   tcp in recv_mesg then we could remove the
-				   send_sig call */
-				force_sig(SIGKILL, srvTcp->tsk);
-				kthread_stop(srvTcp->tsk);
-			}
+			force_sig(SIGKILL, srvTcp->tsk);
 		}
 		 /* If find_unc succeeded then rc == 0 so we can not end */
 		if (tcon)  /* up accidently freeing someone elses tcon struct */
@@ -2255,18 +2241,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 					/* if the socketUseCount is now zero */
 					if ((temp_rc == -ESHUTDOWN) &&
 					    (pSesInfo->server) &&
-					    (pSesInfo->server->tsk)) {
+					    (pSesInfo->server->tsk))
 						force_sig(SIGKILL,
 							pSesInfo->server->tsk);
-						kthread_stop(pSesInfo->server->tsk);
-					}
 				} else {
 					cFYI(1, ("No session or bad tcon"));
 					if ((pSesInfo->server) &&
 					    (pSesInfo->server->tsk)) {
+						spin_lock(&GlobalMid_Lock);
+						srvTcp->tcpStatus = CifsExiting;
+						spin_unlock(&GlobalMid_Lock);
 						force_sig(SIGKILL,
 							pSesInfo->server->tsk);
-						kthread_stop(pSesInfo->server->tsk);
 					}
 				}
 				sesInfoFree(pSesInfo);
@@ -3577,10 +3563,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 				return 0;
 			} else if (rc == -ESHUTDOWN) {
 				cFYI(1, ("Waking up socket by sending signal"));
-				if (cifsd_task) {
+				if (cifsd_task)
 					force_sig(SIGKILL, cifsd_task);
-					kthread_stop(cifsd_task);
-				}
 				rc = 0;
 			} /* else - we have an smb session
 				left on this socket do not kill cifsd */
@@ -3710,7 +3694,9 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		cERROR(1, ("Send error in SessSetup = %d", rc));
 	} else {
 		cFYI(1, ("CIFS Session Established successfully"));
+			spin_lock(&GlobalMid_Lock);
 			pSesInfo->status = CifsGood;
+			spin_unlock(&GlobalMid_Lock);
 	}
 
 ss_err_exit:
-- 
cgit v1.2.3


From dd1db2dedc4f6ad736fbba5aa6ecfe6e7c8ee0f4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 16 Oct 2008 19:27:12 -0400
Subject: cifs: don't use CREATE_DELETE_ON_CLOSE in cifs_rename_pending_delete

cifs: don't use CREATE_DELETE_ON_CLOSE in cifs_rename_pending_delete

CREATE_DELETE_ON_CLOSE apparently has different semantics than when you
set the DELETE_ON_CLOSE bit after opening the file. Setting it in the
open says "delete this file as soon as this filehandle is closed". That's
not what we want for cifs_rename_pending_delete.

Don't set this bit in the CreateFlags. Experimentation shows that
setting this flag in the SET_FILE_INFO call has no effect.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8c833345fc..fe4f2ee97b6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -784,8 +784,7 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	FILE_BASIC_INFO *info_buf;
 
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
-			 DELETE|FILE_WRITE_ATTRIBUTES,
-			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc != 0)
@@ -810,17 +809,23 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 		goto out_close;
 	cifsInode->cifsAttrs = dosattr;
 
-	/* silly-rename the file */
-	CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+	/* rename the file */
+	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
 
 	/* set DELETE_ON_CLOSE */
 	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
 
 	/*
 	 * some samba versions return -ENOENT when we try to set the file
-	 * disposition here. Likely a samba bug, but work around it for now
+	 * disposition here. Likely a samba bug, but work around it for now.
+	 * This means that some cifsXXX files may hang around after they
+	 * shouldn't.
+	 *
+	 * BB: remove this once fixed samba servers are in the field
 	 */
 	if (rc == -ENOENT)
 		rc = 0;
-- 
cgit v1.2.3


From faa5c2a15e14b6a4e59fcae65dec5258e723ea9f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Fri, 17 Oct 2008 16:19:45 +0200
Subject: [JFFS2] Correct parameter names of jffs2_compress() in comments

Make the parameter names of jffs2_compress() in its comments match with the
actual implementation

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 86739ee53b3..f25e70c1b51 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
 }
 
 /* jffs2_compress:
- * @data: Pointer to uncompressed data
- * @cdata: Pointer to returned pointer to buffer for compressed data
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
  * @datalen: On entry, holds the amount of data available for compression.
  *	On exit, expected to hold the amount of data actually compressed.
  * @cdatalen: On entry, holds the amount of space available for compressed
-- 
cgit v1.2.3


From 8b81ef589ad1483dd977ef47fe00d4ce4d91a0ab Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@ericvh-desktop.austin.ibm.com>
Date: Mon, 13 Oct 2008 18:45:25 -0500
Subject: 9p: consolidate transport structure

Right now there is a transport module structure which provides per-transport
type functions and data and a transport structure which contains per-instance
public data as well as function pointers to instance specific functions.

This patch moves public transport visible instance data to the client
structure (which in some cases had duplicate data) and consolidates the
functions into the transport module structure.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index c061c3f18e7..b6b85cf01e0 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -30,8 +30,8 @@
 #include <linux/parser.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
-#include <net/9p/transport.h>
 #include <net/9p/client.h>
+#include <net/9p/transport.h>
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 
-- 
cgit v1.2.3


From 0fc9655ec67ec5d4dfd08e469e0e9f0a494bf5bc Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Mon, 13 Oct 2008 20:36:17 -0500
Subject: 9p: consolidate read/write functions

Currently there are two separate versions of read and write.  One for
dealing with user buffers and the other for dealing with kernel buffers.
There is a tremendous amount of code duplication in the otherwise
identical versions of these functions.  This patch adds an additional
user buffer parameter to read and write and conditionalizes handling of
the buffer on whether the kernel buffer or the user buffer is populated.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 52944d2249a..3819a195de8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -136,7 +136,7 @@ v9fs_file_read(struct file *filp, char __user * data, size_t count,
 
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	fid = filp->private_data;
-	ret = p9_client_uread(fid, data, *offset, count);
+	ret = p9_client_read(fid, NULL, data, *offset, count);
 	if (ret > 0)
 		*offset += ret;
 
@@ -164,7 +164,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		(int)count, (int)*offset);
 
 	fid = filp->private_data;
-	ret = p9_client_uwrite(fid, data, *offset, count);
+	ret = p9_client_write(fid, NULL, data, *offset, count);
 	if (ret > 0) {
 		invalidate_inode_pages2_range(inode->i_mapping, *offset,
 								*offset+ret);
-- 
cgit v1.2.3


From fbedadc16e5c888e4df9df3b1757de4993508d35 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Mon, 13 Oct 2008 20:36:16 -0500
Subject: 9p: move readn meta-function from client to fs layer

There are a couple of methods in the client code which aren't actually
wire operations.  To keep things organized cleaner, these operations are
being moved to the fs layer.

This patch moves the readn meta-function (which executes multiple wire
reads until a buffer is full) to the fs layer.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c     |  2 +-
 fs/9p/v9fs_vfs.h |  2 ++
 fs/9p/vfs_addr.c |  5 +----
 fs/9p/vfs_file.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 57 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index b6b85cf01e0..24eb01087b6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -234,7 +234,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (!v9ses->clnt->dotu)
 		v9ses->flags &= ~V9FS_EXTENDED;
 
-	v9ses->maxdata = v9ses->clnt->msize;
+	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!v9fs_extended(v9ses) &&
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 57997fa14e6..046cff377f1 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,3 +52,5 @@ int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
 void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
+
+ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 97d3aed5798..6fcb1e7095c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,7 +38,6 @@
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
-#include "fid.h"
 
 /**
  * v9fs_vfs_readpage - read an entire page in from 9P
@@ -53,14 +52,12 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	int retval;
 	loff_t offset;
 	char *buffer;
-	struct p9_fid *fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
-	fid = filp->private_data;
 	buffer = kmap(page);
 	offset = page_offset(page);
 
-	retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE);
+	retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
 	if (retval < 0)
 		goto done;
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3819a195de8..4d6d7657fb7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -120,23 +120,72 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 }
 
 /**
- * v9fs_file_read - read from a file
+ * v9fs_file_readn - read from a file
  * @filp: file pointer to read
  * @data: data buffer to read data into
+ * @udata: user data buffer to read data into
  * @count: size of buffer
  * @offset: offset at which to read data
  *
  */
+
+ssize_t
+v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+	       u64 offset)
+{
+	int n, total;
+	struct p9_fid *fid = filp->private_data;
+
+	P9_DPRINTK(P9_DEBUG_9P, "fid %d offset %llu count %d\n", fid->fid,
+					(long long unsigned) offset, count);
+
+	n = 0;
+	total = 0;
+	do {
+		n = p9_client_read(fid, data, udata, offset, count);
+		if (n <= 0)
+			break;
+
+		if (data)
+			data += n;
+		if (udata)
+			udata += n;
+
+		offset += n;
+		count -= n;
+		total += n;
+	} while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+
+	if (n < 0)
+		total = n;
+
+	return total;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
 static ssize_t
-v9fs_file_read(struct file *filp, char __user * data, size_t count,
+v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	       loff_t * offset)
 {
 	int ret;
 	struct p9_fid *fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	P9_DPRINTK(P9_DEBUG_VFS, "count %d offset %lld\n", count, *offset);
 	fid = filp->private_data;
-	ret = p9_client_read(fid, NULL, data, *offset, count);
+
+	if (count > (fid->clnt->msize - P9_IOHDRSZ))
+		ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
+	else
+		ret = p9_client_read(fid, NULL, udata, *offset, count);
+
 	if (ret > 0)
 		*offset += ret;
 
-- 
cgit v1.2.3


From dfb0ec2e13a906ff19a0bbfa9208caab50cfc2e3 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Mon, 13 Oct 2008 20:36:16 -0500
Subject: 9p: adjust 9p vfs write operation

Currently, the 9p net wire operation ensures that all data is sent by sending
multiple packets if the data requested is larger than the msize.  This is
better handled in the vfs code so that we can simplify wire operations to
being concerned with only putting data onto and taking data off of the wire.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 4d6d7657fb7..3fd28bbafc8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -205,19 +205,38 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
 		size_t count, loff_t * offset)
 {
-	int ret;
+	int n, rsize, total = 0;
 	struct p9_fid *fid;
+	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	int origin = *offset;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		(int)count, (int)*offset);
 
 	fid = filp->private_data;
-	ret = p9_client_write(fid, NULL, data, *offset, count);
-	if (ret > 0) {
-		invalidate_inode_pages2_range(inode->i_mapping, *offset,
-								*offset+ret);
-		*offset += ret;
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		n = p9_client_write(fid, NULL, data+total, *offset+total,
+									rsize);
+		if (n <= 0)
+			break;
+		count -= n;
+		total += n;
+	} while (count > 0);
+
+	if (total > 0) {
+		invalidate_inode_pages2_range(inode->i_mapping, origin,
+								origin+total);
+		*offset += total;
 	}
 
 	if (*offset > inode->i_size) {
@@ -225,7 +244,10 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
 	}
 
-	return ret;
+	if (n < 0)
+		return n;
+
+	return total;
 }
 
 static const struct file_operations v9fs_cached_file_operations = {
-- 
cgit v1.2.3


From 06b55b464ee5b305aca75cb7d9424b184bf07f68 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Mon, 13 Oct 2008 20:36:15 -0500
Subject: 9p: move dirread to fs layer

Currently reading a directory is implemented in the client code.
This function is not actually a wire operation, but a meta operation
which calls read operations and processes the results.

This patch moves this functionality to the fs layer and calls component
wire operations instead of constructing their packets.  This provides a
cleaner separation and will help when we reorganize the client functions
and protocol processing methods.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dir.c | 54 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index e298fe19409..d7d0ac5a2ca 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -69,32 +69,54 @@ static inline int dt_type(struct p9_stat *mistat)
 static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	int over;
+	struct p9_stat st;
+	int err;
 	struct p9_fid *fid;
-	struct v9fs_session_info *v9ses;
-	struct inode *inode;
-	struct p9_stat *st;
+	int buflen;
+	char *statbuf;
+	int n, i = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	inode = filp->f_path.dentry->d_inode;
-	v9ses = v9fs_inode2v9ses(inode);
 	fid = filp->private_data;
-	while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) {
-		if (IS_ERR(st))
-			return PTR_ERR(st);
 
-		over = filldir(dirent, st->name.str, st->name.len, filp->f_pos,
-			v9fs_qid2ino(&st->qid), dt_type(st));
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+	statbuf = kmalloc(buflen, GFP_KERNEL);
+	if (!statbuf)
+		return -ENOMEM;
 
-		if (over)
+	while (1) {
+		err = v9fs_file_readn(filp, statbuf, NULL, fid->rdir_fpos,
+									buflen);
+		if (err <= 0)
 			break;
 
-		filp->f_pos += st->size;
-		kfree(st);
-		st = NULL;
+		n = err;
+		while (i < n) {
+			err = p9_deserialize_stat(statbuf + i, buflen-i, &st,
+							fid->clnt->dotu);
+			if (!err) {
+				err = -EIO;
+				goto free_and_exit;
+			}
+
+			i += err;
+			fid->rdir_fpos += err;
+
+			over = filldir(dirent, st.name.str, st.name.len,
+			    filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
+
+			filp->f_pos += st.size;
+
+			if (over) {
+				err = 0;
+				goto free_and_exit;
+			}
+		}
 	}
 
-	kfree(st);
-	return 0;
+free_and_exit:
+	kfree(statbuf);
+	return err;
 }
 
 
-- 
cgit v1.2.3


From 51a87c552dfd428e304c865e24ecbe091556f226 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 16 Oct 2008 08:30:07 -0500
Subject: 9p: rework client code to use new protocol support functions

Now that the new protocol functions are in place, this patch switches
the client code to using the new support code.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs_vfs.h  |  4 ++--
 fs/9p/vfs_dir.c   |  4 ++--
 fs/9p/vfs_file.c  |  2 +-
 fs/9p/vfs_inode.c | 38 ++++++++++++++++++--------------------
 fs/9p/vfs_super.c |  6 +++++-
 5 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 046cff377f1..c295ba786ed 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -46,10 +46,10 @@ extern struct dentry_operations v9fs_cached_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *);
+void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
+void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d7d0ac5a2ca..276aed62592 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -85,8 +85,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		return -ENOMEM;
 
 	while (1) {
-		err = v9fs_file_readn(filp, statbuf, NULL, fid->rdir_fpos,
-									buflen);
+		err = v9fs_file_readn(filp, statbuf, NULL, buflen,
+								fid->rdir_fpos);
 		if (err <= 0)
 			break;
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3fd28bbafc8..041c5269228 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -136,7 +136,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 	int n, total;
 	struct p9_fid *fid = filp->private_data;
 
-	P9_DPRINTK(P9_DEBUG_9P, "fid %d offset %llu count %d\n", fid->fid,
+	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
 					(long long unsigned) offset, count);
 
 	n = 0;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e83aa5ebe86..e96d84aafbe 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -334,7 +334,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 {
 	int err, umode;
 	struct inode *ret;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	ret = NULL;
 	st = p9_client_stat(fid);
@@ -417,6 +417,8 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	struct p9_fid *dfid, *ofid, *fid;
 	struct inode *inode;
 
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+
 	err = 0;
 	ofid = NULL;
 	fid = NULL;
@@ -424,6 +426,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dfid = v9fs_fid_clone(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
 		dfid = NULL;
 		goto error;
 	}
@@ -432,18 +435,22 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = p9_client_walk(dfid, 0, NULL, 1);
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		ofid = NULL;
 		goto error;
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
-	if (err < 0)
+	if (err < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
 		goto error;
+	}
 
 	/* now walk from the parent so we can get unopened fid */
 	fid = p9_client_walk(dfid, 1, &name, 0);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
 	} else
@@ -453,6 +460,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
+		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
 
@@ -734,7 +742,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
@@ -815,10 +823,9 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
  */
 
 void
-v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
+v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
-	int n;
 	char ext[32];
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
@@ -842,11 +849,7 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
 		int major = -1;
 		int minor = -1;
 
-		n = stat->extension.len;
-		if (n > sizeof(ext)-1)
-			n = sizeof(ext)-1;
-		memmove(ext, stat->extension.str, n);
-		ext[n] = 0;
+		strncpy(ext, stat->extension, sizeof(ext));
 		sscanf(ext, "%c %u %u", &type, &major, &minor);
 		switch (type) {
 		case 'c':
@@ -857,8 +860,8 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
 			break;
 		default:
 			P9_DPRINTK(P9_DEBUG_ERROR,
-				"Unknown special type %c (%.*s)\n", type,
-				stat->extension.len, stat->extension.str);
+				"Unknown special type %c %s\n", type,
+				stat->extension);
 		};
 		inode->i_rdev = MKDEV(major, minor);
 	} else
@@ -904,7 +907,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
 	retval = -EPERM;
@@ -926,15 +929,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	if (st->extension.len < buflen)
-		buflen = st->extension.len + 1;
-
-	memmove(buffer, st->extension.str, buflen - 1);
-	buffer[buflen-1] = 0;
+	strncpy(buffer, st->extension, buflen);
 
 	P9_DPRINTK(P9_DEBUG_VFS,
-		"%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len,
-		st->extension.str, buffer);
+		"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
 
 	retval = buflen;
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf59c396049..d6cb1a0ca72 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -111,7 +111,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct inode *inode = NULL;
 	struct dentry *root = NULL;
 	struct v9fs_session_info *v9ses = NULL;
-	struct p9_stat *st = NULL;
+	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
 	uid_t uid = current->fsuid;
 	gid_t gid = current->fsgid;
@@ -161,10 +161,14 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	sb->s_root = root;
 	root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+
 	v9fs_stat2inode(st, root->d_inode, sb);
+
 	v9fs_fid_add(root, fid);
+	p9stat_free(st);
 	kfree(st);
 
+P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
 	return simple_set_mnt(mnt, sb);
 
 release_sb:
-- 
cgit v1.2.3


From 02da398b950c5d079c20afaa23f322383e96070a Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 16 Oct 2008 08:29:30 -0500
Subject: 9p: eliminate depricated conv functions

Remove depricated conv functions which have been replaced with new
protocol routines.

This patch also reworks the one instance of the file-system code which
directly calls conversion routines (to accomplish unpacking dirreads).

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dir.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 276aed62592..873cd31baa4 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -45,7 +45,7 @@
  *
  */
 
-static inline int dt_type(struct p9_stat *mistat)
+static inline int dt_type(struct p9_wstat *mistat)
 {
 	unsigned long perm = mistat->mode;
 	int rettype = DT_REG;
@@ -69,7 +69,7 @@ static inline int dt_type(struct p9_stat *mistat)
 static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	int over;
-	struct p9_stat st;
+	struct p9_wstat st;
 	int err;
 	struct p9_fid *fid;
 	int buflen;
@@ -92,20 +92,24 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		n = err;
 		while (i < n) {
-			err = p9_deserialize_stat(statbuf + i, buflen-i, &st,
+			err = p9stat_read(statbuf + i, buflen-i, &st,
 							fid->clnt->dotu);
-			if (!err) {
+			if (err) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
+				p9stat_free(&st);
 				goto free_and_exit;
 			}
 
-			i += err;
-			fid->rdir_fpos += err;
+			i += st.size+2;
+			fid->rdir_fpos += st.size+2;
 
-			over = filldir(dirent, st.name.str, st.name.len,
+			over = filldir(dirent, st.name, strlen(st.name),
 			    filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
 
-			filp->f_pos += st.size;
+			filp->f_pos += st.size+2;
+
+			p9stat_free(&st);
 
 			if (over) {
 				err = 0;
-- 
cgit v1.2.3


From 18de9735300756e3ca9c361ef58409d8561dfe0d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 16 Oct 2008 17:41:11 -0400
Subject: NFS: Enable NFSv4 callback server to listen on AF_INET6 sockets

Allow the NFS callback server to listen for requests via an AF_INET6 or
AF_INET socket when IPv6 support is present in the kernel.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 6a09760c596..c2e9cfd9e5a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -40,6 +40,16 @@ unsigned short nfs_callback_tcpport;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
+/*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const sa_family_t	nfs_callback_family = AF_INET6;
+#else
+static const sa_family_t	nfs_callback_family = AF_INET;
+#endif
+
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -106,7 +116,7 @@ int nfs_callback_up(void)
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
 	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
-				AF_INET, NULL);
+				nfs_callback_family, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
@@ -116,7 +126,8 @@ int nfs_callback_up(void)
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
-	dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
+	dprintk("NFS: Callback listener port = %u (af %u)\n",
+			nfs_callback_tcpport, nfs_callback_family);
 
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
@@ -149,8 +160,8 @@ out:
 	mutex_unlock(&nfs_callback_mutex);
 	return ret;
 out_err:
-	dprintk("Couldn't create callback socket or server thread; err = %d\n",
-		ret);
+	dprintk("NFS: Couldn't create callback socket or server thread; "
+		"err = %d\n", ret);
 	nfs_callback_info.users--;
 	goto out;
 }
-- 
cgit v1.2.3


From 504e518953a330c8d44a95bdd65a5c9f50f1012e Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 16 Oct 2008 14:15:16 +1100
Subject: Make nfs_file_cred more robust.

As not all files have an associated open_context (e.g. device special
files), it is safest to test for the existence of the open context
before de-referencing it.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c910413eaec..83e700a2b0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		struct nfs_open_context *ctx;
 
 		ctx = nfs_file_open_context(sattr->ia_file);
-		cred = ctx->cred;
-		state = ctx->state;
+		if (ctx) {
+			cred = ctx->cred;
+			state = ctx->state;
+		}
 	}
 
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
-- 
cgit v1.2.3


From ec9a05c94c7cd00b4f0ab126c3d394f2fc2e4712 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 17 Oct 2008 10:44:37 -0400
Subject: NFS: use correct fs type for v4 submounts and referrals

Signed-off-by: Andy Adamson<andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8b28b95c9e4..a3b0061dfd4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2459,7 +2459,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -2544,7 +2544,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
-- 
cgit v1.2.3


From 57c7b4e68edf3b4fe7f977db9ad437e0f7f7c382 Mon Sep 17 00:00:00 2001
From: Magnus Deininger <dma05@web.de>
Date: Fri, 17 Oct 2008 12:44:46 -0500
Subject: 9p: fix device file handling

In v9fs_get_inode(), for block, as well as char devices (in theory),
the function init_special_inode() is called to set up callback functions
for file ops. this function uses the file mode's value to determine whether
to use block or char dev functions. In v9fs_inode_from_fid(), the function
p9mode2unixmode() is used, but for all devices it initially returns S_IFBLK,
then uses v9fs_get_inode() to initialise a new inode, then finally uses
v9fs_stat2inode(), which would determine whether the inode is a block or
character device. However, at that point init_special_inode() had already
decided to use the block device functions, so even if the inode's mode is
turned to a character device, the block functions are still used to operate
on them. The attached patch simply calls init_special_inode() again for devices
after parsing device node data in v9fs_stat2inode() so that the proper functions
are used.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e96d84aafbe..8314d3f43b7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -864,6 +864,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 				stat->extension);
 		};
 		inode->i_rdev = MKDEV(major, minor);
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 	} else
 		inode->i_rdev = 0;
 
-- 
cgit v1.2.3


From 5bf1723723487ddb0b9c9641b6559da96b27cc93 Mon Sep 17 00:00:00 2001
From: Alexander Belyakov <abelyako@mail.ru>
Date: Fri, 17 Oct 2008 19:19:13 +0400
Subject: [JFFS2] Write buffer offset adjustment for NOR-ECC (Sibley) flash

After choosing new c->nextblock, don't leave the wbuf offset field
occasionally pointing at the start of the next physical eraseblock.
This was causing a BUG() on NOR-ECC (Sibley) flash, where we start
writing after the cleanmarker.

Among other this fix should cover write buffer offset adjustment
after flushing the last page of an eraseblock.

Signed-off-by: Alexander Belyakov <abelyako@googlemail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/nodemgmt.c | 4 ++++
 fs/jffs2/wbuf.c     | 5 +----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index a9bf9603c1b..0875b60b4bf 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+	/* adjust write buffer offset, else we get a non contiguous write bug */
+	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
+		c->wbuf_ofs = 0xffffffff;
+
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
 	return 0;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 0e78b00035e..d9a721e6db7 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 
 	memset(c->wbuf,0xff,c->wbuf_pagesize);
 	/* adjust write buffer offset, else we get a non contiguous write bug */
-	if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize))
-		c->wbuf_ofs += c->wbuf_pagesize;
-	else
-		c->wbuf_ofs = 0xffffffff;
+	c->wbuf_ofs += c->wbuf_pagesize;
 	c->wbuf_len = 0;
 	return 0;
 }
-- 
cgit v1.2.3


From 727d2dc045930b29dc68d56d5032d23661ba8503 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 17 Oct 2008 16:52:10 +0300
Subject: UBIFS: do not read unnecessary bytes when unpacking bits

Fixes the following Oops:

BUG: unable to handle kernel paging request at f8d24000
IP: [<f8ff0657>] :ubifs:ubifs_unpack_bits+0xcd/0x231
*pde = 34333067 *pte = 00000000
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: deflate zlib_deflate lzo lzo_decompress lzo_compress
ubifs ubi nandsim nand nand_ids nand_ecc mtd nfsd lockd sunrpc exportfs
[last unloaded: nand_ecc]

Pid: 7450, comm: sync Not tainted (2.6.27-rc8-ubifs-2.6 #27)
EIP: 0060:[<f8ff0657>] EFLAGS: 00010206 CPU: 0
EIP is at ubifs_unpack_bits+0xcd/0x231 [ubifs]
EAX: 00000000 EBX: 00000000 ECX: d7e43dc0 EDX: 0000ff00
ESI: 00000004 EDI: f8d23ffe EBP: d7e43db4 ESP: d7e43d8c
DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process sync (pid: 7450, ti=d7e42000 task=eb6f9530 task.ti=d7e42000)
Stack: 00000400 c0103db4 dc5e8090 d7e43dc0 d7e43dc0 d7e43dc4 0000001c 00000004
      f496d1e0 f8d23ffc d7e43dd4 f8ffac7e f8d23ffe 00000000 f8d23ffe f2b7af68
      f496d1e0 f8d23ffc d7e43e2c f8ffadc5 00000000 0001f000 00000000 c03b10a7
Call Trace:
[<c0103db4>] ? restore_nocheck_notrace+0x0/0xe
[<f8ffac7e>] ? is_a_node+0x43/0x92 [ubifs]
[<f8ffadc5>] ? dbg_check_ltab+0xf8/0x5c9 [ubifs]
[<c03b10a7>] ? mutex_lock_nested+0x1b2/0x2a0
[<f8ffc86e>] ? ubifs_lpt_start_commit+0x49/0xecb [ubifs]
[<c03b0ef3>] ? mutex_unlock+0xd/0xf
[<f8fef017>] ? ubifs_tnc_start_commit+0x1cf/0xef8 [ubifs]
[<f8fe65d8>] ? do_commit+0x18f/0x52d [ubifs]
[<f8fe69f6>] ? ubifs_run_commit+0x80/0xca [ubifs]
[<f8fd8d35>] ? ubifs_sync_fs+0xdb/0xf6 [ubifs]
[<c0181a07>] ? sync_filesystems+0xc6/0x10c
[<c019f279>] ? do_sync+0x3b/0x6a
[<c019f2ba>] ? sys_sync+0x12/0x18
[<c0103ced>] ? sysenter_do_call+0x12/0x35
=======================
Code: 4d ec 89 01 8b 45 e8 89 10 89 d8 89 f1 d3 e8 85 c0 74 07 29 d6 83 fe
20 75 2a 89 d8 83 c4 1c 5b 5e 5f 5d c3 0f b6 57 01 c1 e2 08 <0f> b6 47 02
c1 e0 10 09 c2 0f b6 07 09 c2 0f b
EIP: [<f8ff0657>] ubifs_unpack_bits+0xcd/0x231 [ubifs] SS:ESP 0068:d7e43d8c
---[ end trace 1bbb4c407a6dd816 ]---

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/lpt.c | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index cd11b23a187..db8bd0e518b 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -288,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
 	const int k = 32 - nrbits;
 	uint8_t *p = *addr;
 	int b = *pos;
-	uint32_t val;
+	uint32_t uninitialized_var(val);
+	const int bytes = (nrbits + b + 7) >> 3;
 
 	ubifs_assert(nrbits > 0);
 	ubifs_assert(nrbits <= 32);
 	ubifs_assert(*pos >= 0);
 	ubifs_assert(*pos < 8);
 	if (b) {
-		val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) |
-		      ((uint32_t)p[4] << 24);
+		switch (bytes) {
+		case 2:
+			val = p[1];
+			break;
+		case 3:
+			val = p[1] | ((uint32_t)p[2] << 8);
+			break;
+		case 4:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16);
+			break;
+		case 5:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16) |
+				     ((uint32_t)p[4] << 24);
+		}
 		val <<= (8 - b);
 		val |= *p >> b;
 		nrbits += b;
-	} else
-		val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) |
-		      ((uint32_t)p[3] << 24);
+	} else {
+		switch (bytes) {
+		case 1:
+			val = p[0];
+			break;
+		case 2:
+			val = p[0] | ((uint32_t)p[1] << 8);
+			break;
+		case 3:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16);
+			break;
+		case 4:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16) |
+				     ((uint32_t)p[3] << 24);
+			break;
+		}
+	}
 	val <<= k;
 	val >>= k;
 	b = nrbits & 7;
-	p += nrbits / 8;
+	p += nrbits >> 3;
 	*addr = p;
 	*pos = b;
 	ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
-- 
cgit v1.2.3


From fae7fb299f382355145c6cb4eec6aa84f4cd1fca Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 17 Oct 2008 18:49:23 +0300
Subject: UBIFS: amend printk

It is better to print "Reserved for root" than
"Reserved pool size", because it is more obvious for users
what this means.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7fd759dde79..046bf087923 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1217,7 +1217,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	ubifs_msg("media format:       %d (latest is %d)",
 		  c->fmt_version, UBIFS_FORMAT_VERSION);
 	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
-	ubifs_msg("reserved pool size: %llu bytes (%llu KiB)",
+	ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",
 		c->report_rp_size, c->report_rp_size >> 10);
 
 	dbg_msg("compiled on:         " __DATE__ " at " __TIME__);
-- 
cgit v1.2.3


From 54779aabb0183bbe049d2b52e96cd148366dfb0b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Fri, 17 Oct 2008 16:16:10 +0200
Subject: UBIFS: fix ubifs_compress commentary

Update the comment for ubifs_compress(), which incorrectly states that it
returnsa success/failure indicator.

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/compress.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 5bb51dac3c1..a0ada596b17 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
  *
  * Note, if the input buffer was not compressed, it is copied to the output
  * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
- *
- * This functions returns %0 on success or a negative error code on failure.
  */
 void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
 		    int *compr_type)
-- 
cgit v1.2.3


From 9a8165fce724d1aba21e2c713ac6ba11dbfecafa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 17 Oct 2008 21:03:20 -0400
Subject: cifs: track DeletePending flag in cifsInodeInfo

cifs: track DeletePending flag in cifsInodeInfo

The QPathInfo call returns a flag that indicates whether DELETE_ON_CLOSE
is set. Track it in the cifsInodeInfo.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   | 1 +
 fs/cifs/cifsglob.h | 1 +
 fs/cifs/cifssmb.c  | 2 ++
 fs/cifs/inode.c    | 1 +
 4 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 25ecbd5b040..c6aad775dd6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -309,6 +309,7 @@ cifs_alloc_inode(struct super_block *sb)
 	file data or metadata */
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
+	cifs_inode->delete_pending = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 
 	/* Can not set i_flags here - they get immediately overwritten
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0d22479d99b..178f733a368 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -353,6 +353,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool oplockPending:1;
+	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
 	struct inode vfs_inode;
 };
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6f4ffe15d68..843a85fb8b9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1309,6 +1309,7 @@ OldOpenRetry:
 				cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile));
 			pfile_info->EndOfFile = pfile_info->AllocationSize;
 			pfile_info->NumberOfLinks = cpu_to_le32(1);
+			pfile_info->DeletePending = 0;
 		}
 	}
 
@@ -1410,6 +1411,7 @@ openRetry:
 		    pfile_info->AllocationSize = pSMBr->AllocationSize;
 		    pfile_info->EndOfFile = pSMBr->EndOfFile;
 		    pfile_info->NumberOfLinks = cpu_to_le32(1);
+		    pfile_info->DeletePending = 0;
 		}
 	}
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fe4f2ee97b6..dea9eeb58b0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -506,6 +506,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	inode = *pinode;
 	cifsInfo = CIFS_I(inode);
 	cifsInfo->cifsAttrs = attr;
+	cifsInfo->delete_pending = pfindData->DeletePending ? true : false;
 	cFYI(1, ("Old time %ld", cifsInfo->time));
 	cifsInfo->time = jiffies;
 	cFYI(1, ("New time %ld", cifsInfo->time));
-- 
cgit v1.2.3


From 3270958b717a13d0228803254609c19184854b9b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 20 Oct 2008 00:44:19 +0000
Subject: [CIFS] undo changes in cifs_rename_pending_delete if it errors out

The cifs_rename_pending_delete process involves multiple steps. If it
fails and we're going to return error, we don't want to leave things in
a half-finished state. Add code to the function to undo changes if
a call fails.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES  |   8 +++++
 fs/cifs/cifsfs.h |   2 +-
 fs/cifs/inode.c  | 102 ++++++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 79 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 06e521a945c..a7255751ffb 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,10 @@
+Version 1.55
+------------
+Various fixes to make delete of open files behavior more predictable
+(when delete of an open file fails we mark the file as "delete-on-close"
+in a way that more servers accept, but only if we can first rename the
+file to a temporary name)
+
 Version 1.54
 ------------
 Fix premature write failure on congested networks (we would give up
@@ -13,6 +20,7 @@ on dns_upcall (resolving DFS referralls).  Fix plain text password
 authentication (requires setting SecurityFlags to 0x30030 to enable
 lanman and plain text though).  Fix writes to be at correct offset when
 file is open with O_APPEND and file is on a directio (forcediretio) mount.
+Fix bug in rewinding readdir directory searches.  Add nodfs mount option.
 
 Version 1.53
 ------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f7b4a5cd837..074de0b5064 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.54"
+#define CIFS_VERSION   "1.55"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index dea9eeb58b0..232ab16d7fd 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -773,16 +773,17 @@ out:
  * anything else.
  */
 static int
-cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
+cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 {
 	int oplock = 0;
 	int rc;
 	__u16 netfid;
+	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	__u32 dosattr;
-	FILE_BASIC_INFO *info_buf;
+	__u32 dosattr, origattr;
+	FILE_BASIC_INFO *info_buf = NULL;
 
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
@@ -791,50 +792,87 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	if (rc != 0)
 		goto out;
 
-	/* set ATTR_HIDDEN and clear ATTR_READONLY */
-	cifsInode = CIFS_I(inode);
-	dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+	origattr = cifsInode->cifsAttrs;
+	if (origattr == 0)
+		origattr |= ATTR_NORMAL;
+
+	dosattr = origattr & ~ATTR_READONLY;
 	if (dosattr == 0)
 		dosattr |= ATTR_NORMAL;
 	dosattr |= ATTR_HIDDEN;
 
-	info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
-	if (info_buf == NULL) {
-		rc = -ENOMEM;
-		goto out_close;
+	/* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */
+	if (dosattr != origattr) {
+		info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+		if (info_buf == NULL) {
+			rc = -ENOMEM;
+			goto out_close;
+		}
+		info_buf->Attributes = cpu_to_le32(dosattr);
+		rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+					current->tgid);
+		/* although we would like to mark the file hidden
+ 		   if that fails we will still try to rename it */
+		if (rc != 0) {
+			cifsInode->cifsAttrs = dosattr;
+		else
+			dosattr = origattr; /* since not able to change them */
 	}
-	info_buf->Attributes = cpu_to_le32(dosattr);
-	rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
-	kfree(info_buf);
-	if (rc != 0)
-		goto out_close;
-	cifsInode->cifsAttrs = dosattr;
 
 	/* rename the file */
 	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc != 0)
-		goto out;
-
-	/* set DELETE_ON_CLOSE */
-	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
+	if (rc != 0) {
+		rc = -ETXTBSY;
+		goto undo_setattr;
+	}
 
-	/*
-	 * some samba versions return -ENOENT when we try to set the file
-	 * disposition here. Likely a samba bug, but work around it for now.
-	 * This means that some cifsXXX files may hang around after they
-	 * shouldn't.
-	 *
-	 * BB: remove this once fixed samba servers are in the field
-	 */
-	if (rc == -ENOENT)
-		rc = 0;
+	/* try to set DELETE_ON_CLOSE */
+	if (!cifsInode->delete_pending) {
+		rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
+					       current->tgid);
+		/*
+		 * some samba versions return -ENOENT when we try to set the
+		 * file disposition here. Likely a samba bug, but work around
+		 * it for now. This means that some cifsXXX files may hang
+		 * around after they shouldn't.
+		 *
+		 * BB: remove this hack after more servers have the fix
+		 */
+		if (rc == -ENOENT)
+			rc = 0;
+		else if (rc != 0) {
+			rc = -ETXTBSY;
+			goto undo_rename;
+		}
+		cifsInode->delete_pending = true;
+	}
 
 out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
+	kfree(info_buf);
 	return rc;
+
+	/*
+	 * reset everything back to the original state. Don't bother
+	 * dealing with errors here since we can't do anything about
+	 * them anyway.
+	 */
+undo_rename:
+	CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+undo_setattr:
+	if (dosattr != origattr) {
+		info_buf->Attributes = cpu_to_le32(origattr);
+		if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+					current->tgid))
+			cifsInode->cifsAttrs = origattr;
+	}
+
+	goto out_close;
 }
 
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
@@ -884,7 +922,7 @@ psx_del_no_retry:
 	} else if (rc == -ENOENT) {
 		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
-		rc = cifs_rename_pending_delete(full_path, inode, xid);
+		rc = cifs_rename_pending_delete(full_path, dentry, xid);
 		if (rc == 0)
 			drop_nlink(inode);
 	} else if (rc == -EACCES && dosattr == 0) {
-- 
cgit v1.2.3


From 4f98a2fee8acdb4ac84545df98cccecfd130f8db Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:32 -0700
Subject: vmscan: split LRU lists into anon & file sets

Split the LRU lists in two, one set for pages that are backed by real file
systems ("file") and one for pages that are backed by memory and swap
("anon").  The latter includes tmpfs.

The advantage of doing this is that the VM will not have to scan over lots
of anonymous pages (which we generally do not want to swap out), just to
find the page cache pages that it should evict.

This patch has the infrastructure and a basic policy to balance how much
we scan the anon lists and how much we scan the file lists.  The big
policy changes are in separate patches.

[lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
[kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
[kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
[hugh@veritas.com: memcg swapbacked pages active]
[hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
[akpm@linux-foundation.org: fix /proc/vmstat units]
[nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
[kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
[kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/file.c        |  4 +--
 fs/nfs/dir.c          |  2 +-
 fs/ntfs/file.c        |  4 +--
 fs/proc/proc_misc.c   | 77 ++++++++++++++++++++++++++++++---------------------
 fs/ramfs/file-nommu.c |  4 +--
 5 files changed, 52 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c4a8a060512..62d8bd8f14c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add(plru_pvec);
+			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2ab70d46ecb..efdba2e802d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
 		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add(&lru_pvec);
+		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d020866d423..3140a4429af 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			pages[nr] = *cached_page;
 			page_cache_get(*cached_page);
 			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add(lru_pvec);
+				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -2084,7 +2084,7 @@ err_out:
 						OSYNC_METADATA|OSYNC_DATA);
 		}
   	}
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 59ea42e1ef0..b8edb286055 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 	unsigned long allowed;
 	struct vmalloc_info vmi;
 	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
 
 /*
  * display in kilobytes.
@@ -154,51 +156,62 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 
 	get_vmalloc_info(&vmi);
 
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
 	/*
 	 * Tagged format, for easy grepping and expansion.
 	 */
 	len = sprintf(page,
-		"MemTotal:     %8lu kB\n"
-		"MemFree:      %8lu kB\n"
-		"Buffers:      %8lu kB\n"
-		"Cached:       %8lu kB\n"
-		"SwapCached:   %8lu kB\n"
-		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
 #ifdef CONFIG_HIGHMEM
-		"HighTotal:    %8lu kB\n"
-		"HighFree:     %8lu kB\n"
-		"LowTotal:     %8lu kB\n"
-		"LowFree:      %8lu kB\n"
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
 #endif
-		"SwapTotal:    %8lu kB\n"
-		"SwapFree:     %8lu kB\n"
-		"Dirty:        %8lu kB\n"
-		"Writeback:    %8lu kB\n"
-		"AnonPages:    %8lu kB\n"
-		"Mapped:       %8lu kB\n"
-		"Slab:         %8lu kB\n"
-		"SReclaimable: %8lu kB\n"
-		"SUnreclaim:   %8lu kB\n"
-		"PageTables:   %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
 #ifdef CONFIG_QUICKLIST
-		"Quicklists:   %8lu kB\n"
+		"Quicklists:     %8lu kB\n"
 #endif
-		"NFS_Unstable: %8lu kB\n"
-		"Bounce:       %8lu kB\n"
-		"WritebackTmp: %8lu kB\n"
-		"CommitLimit:  %8lu kB\n"
-		"Committed_AS: %8lu kB\n"
-		"VmallocTotal: %8lu kB\n"
-		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages),
-		K(global_page_state(NR_ACTIVE)),
-		K(global_page_state(NR_INACTIVE)),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5145cb9125a..76acdbc3461 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 			goto add_error;
 
 		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add(&lru_pvec);
+			__pagevec_lru_add_file(&lru_pvec);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
-- 
cgit v1.2.3


From 7b854121eb3e5ba0241882ff939e2c485228c9c5 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:40 -0700
Subject: Unevictable LRU Page Statistics

Report unevictable pages per zone and system wide.

Kosaki Motohiro added support for memory controller unevictable
statistics.

[riel@redhat.com: fix printk in show_free_areas()]
[akpm@linux-foundation.org: fix units in /proc/vmstats]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index b8edb286055..6dd60eaea99 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -174,6 +174,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Inactive(anon): %8lu kB\n"
 		"Active(file):   %8lu kB\n"
 		"Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		"Unevictable:    %8lu kB\n"
+#endif
 #ifdef CONFIG_HIGHMEM
 		"HighTotal:      %8lu kB\n"
 		"HighFree:       %8lu kB\n"
@@ -212,6 +215,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(pages[LRU_INACTIVE_ANON]),
 		K(pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		K(pages[LRU_UNEVICTABLE]),
+#endif
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
-- 
cgit v1.2.3


From ba9ddf49391645e6bb93219131a40446538a5e76 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:42 -0700
Subject: Ramfs and Ram Disk pages are unevictable

Christoph Lameter pointed out that ram disk pages also clutter the LRU
lists.  When vmscan finds them dirty and tries to clean them, the ram disk
writeback function just redirties the page so that it goes back onto the
active list.  Round and round she goes...

With the ram disk driver [rd.c] replaced by the newer 'brd.c', this is no
longer the case, as ram disk pages are no longer maintained on the lru.
[This makes them unmigratable for defrag or memory hot remove, but that
can be addressed by a separate patch series.] However, the ramfs pages
behave like ram disk pages used to, so:

Define new address_space flag [shares address_space flags member with
mapping's gfp mask] to indicate that the address space contains all
unevictable pages.  This will provide for efficient testing of ramfs pages
in page_evictable().

Also provide wrapper functions to set/test the unevictable state to
minimize #ifdefs in ramfs driver and any other users of this facility.

Set the unevictable state on address_space structures for new ramfs
inodes.  Test the unevictable state in page_evictable() to cull
unevictable pages.

These changes depend on [CONFIG_]UNEVICTABLE_LRU.

[riel@redhat.com: undo the brd.c part]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Debugged-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b13123424e4..f031d1c925f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
-- 
cgit v1.2.3


From 5344b7e648980cc2ca613ec03a56a8222ff48820 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:51 -0700
Subject: vmstat: mlocked pages statistics

Add NR_MLOCK zone page state, which provides a (conservative) count of
mlocked pages (actually, the number of mlocked pages moved off the LRU).

Reworked by lts to fit in with the modified mlock page support in the
Reclaim Scalability series.

[kosaki.motohiro@jp.fujitsu.com: fix incorrect Mlocked field of /proc/meminfo]
[lee.schermerhorn@hp.com: mlocked-pages: add event counting with statistics]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 6dd60eaea99..61b25f4eabe 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -176,6 +176,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Inactive(file): %8lu kB\n"
 #ifdef CONFIG_UNEVICTABLE_LRU
 		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
 #endif
 #ifdef CONFIG_HIGHMEM
 		"HighTotal:      %8lu kB\n"
@@ -217,6 +218,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(pages[LRU_INACTIVE_FILE]),
 #ifdef CONFIG_UNEVICTABLE_LRU
 		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
 #endif
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
-- 
cgit v1.2.3


From 51b07fc3c5c830bb49c80fc5eac041e1f66a72e7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:27:00 -0700
Subject: fs: buffer lock use lock bitops

trylock_buffer and unlock_buffer open and close a critical section.
Hence, we can use the lock bitops to get the desired memory ordering.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index ac78d4c19b3..6569fda5cfe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer);
 
 void unlock_buffer(struct buffer_head *bh)
 {
-	smp_mb__before_clear_bit();
-	clear_buffer_locked(bh);
+	clear_bit_unlock(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
-- 
cgit v1.2.3


From e575f111dc0f27044e170580e7de50985ab3e011 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:27:08 -0700
Subject: coredump_filter: add hugepage dumping

Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped.  But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g.  vmalloc, sound related).

Thus hugepages are never dumped and it can't be debugged easily.  Many
developers want hugepages to be included into core-dump.

However, We can't read generic VM_RESERVED area because this area is often
IO mapping area.  then these area reading may change device state.  it is
definitly undesiable side-effect.

So adding a hugepage specific bit to the coredump filter is better.  It
will be able to hugepage core dumping and doesn't cause any side-effect to
any i/o devices.

In additional, libhugetlb use hugetlb private mapping pages as anonymous
page.  Then, hugepage private mapping pages should be core dumped by
default.

Then, /proc/[pid]/core_dump_filter has two new bits.

 - bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes)
 - bit 6 mean hugetlb shared mapping pages are dumped or not.  (default: no)

I tested by following method.

% ulimit -c unlimited
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>

#include "hugetlbfs.h"

int main(int argc, char** argv){
	char* p;
	int ch;
	int mmap_flags = MAP_SHARED;
	int fd;
	int nr_pages;

	while((ch = getopt(argc, argv, "p")) != -1) {
		switch (ch) {
		case 'p':
			mmap_flags &= ~MAP_SHARED;
			mmap_flags |= MAP_PRIVATE;
			break;
		default:
			/* nothing*/
			break;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc == 0){
		printf("need # of pages\n");
		exit(1);
	}

	nr_pages = atoi(argv[0]);
	if (nr_pages < 2) {
		printf("nr_pages must >2\n");
		exit(1);
	}

	fd = hugetlbfs_unlinked_fd();
	p = mmap(NULL, nr_pages * gethugepagesize(),
		 PROT_READ|PROT_WRITE, mmap_flags, fd, 0);

	sleep(2);

	*(p + gethugepagesize()) = 1; /* COW */
	sleep(2);

	/* crash! */
	*(int*)0 = 1;

	return 0;
}

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c76afa26edf..e2159063198 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off)
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
 				   unsigned long mm_flags)
 {
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
 	/* The vma can be set up to tell us the answer directly.  */
 	if (vma->vm_flags & VM_ALWAYSDUMP)
 		goto whole;
 
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
 	/* Do not dump I/O mapped devices or special mappings */
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
-
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
-- 
cgit v1.2.3


From d1645e526a1e5842c9ac433d73419ba886676cf3 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:53 -0700
Subject: jbd: abort when failed to log metadata buffers

If we failed to write metadata buffers to the journal space and succeeded
to write the commit record, stale data can be written back to the
filesystem as metadata in the recovery phase.

To avoid this, when we failed to write out metadata buffers, abort the
journal before writing the commit record.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/commit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ae08c057e75..f1ea861b992 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -762,6 +762,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 6\n");
 
 	if (journal_write_commit_record(journal, commit_transaction))
-- 
cgit v1.2.3


From 885e353c7427db7b60692789741b34e605b0b69b Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:54 -0700
Subject: jbd: don't dirty original metadata buffer on abort

Currently, original metadata buffers are dirtied when they are unfiled
whether the journal has aborted or not.  Eventually these buffers will be
written-back to the filesystem by pdflush.  This means some metadata
buffers are written to the filesystem without journaling if the journal
aborts.  So if both journal abort and system crash happen at the same
time, the filesystem would become inconsistent state.  Additionally,
replaying journaled metadata can overwrite the latest metadata on the
filesystem partly.  Because, if the journal aborts, journaled metadata are
preserved and replayed during the next mount not to lose uncheckpointed
metadata.  This would also break the consistency of the filesystem.

This patch prevents original metadata buffers from being dirtied on abort
by clearing BH_JBDDirty flag from those buffers.  Thus, no metadata
buffers are written to the filesystem without journaling.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/commit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f1ea861b992..d6a6659f3e4 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -518,9 +518,10 @@ void journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -855,6 +856,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
-- 
cgit v1.2.3


From 972fbf779832e5ad15effa7712789aeff9224c37 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Sat, 18 Oct 2008 20:27:55 -0700
Subject: ext3: don't try to resize if there are no reserved gdt blocks left

When trying to resize a ext3 fs and you run out of reserved gdt blocks,
you get an error that doesn't actually tell you what went wrong, it just
says that the gdb it picked is not correct, which is the case since you
don't have any reserved gdt blocks left.  This patch adds a check to make
sure you have reserved gdt blocks to use, and if not prints out a more
relevant error.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Andreas Dilger <adilger@sun.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/resize.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 77278e947e9..78fdf383637 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT3_HAS_COMPAT_FEATURE(sb,
-					     EXT3_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT3_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext3_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
-- 
cgit v1.2.3


From 46d01a225e694f1a4343beea44f1e85105aedd7e Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sat, 18 Oct 2008 20:27:56 -0700
Subject: ext3: fix ext3 block reservation early ENOSPC issue

We could run into ENOSPC error on ext3, even when there is free blocks on
the filesystem.

The problem is triggered in the case the goal block group has 0 free
blocks , and the rest block groups are skipped due to the check of
"free_blocks < windowsz/2".  Current code could fall back to non
reservation allocation to prevent early ENOSPC after examing all the block
groups with reservation on , but this code was bypassed if the reservation
window is turned off already, which is true in this case.

This patch fixed two issues:
1) We don't need to turn off block reservation if the goal block group has
0 free blocks left and continue search for the rest of block groups.

Current code the intention is to turn off the block reservation if the
goal allocation group has a few (some) free blocks left (not enough for
make the desired reservation window),to try to allocation in the goal
block group, to get better locality.  But if the goal blocks have 0 free
blocks, it should leave the block reservation on, and continues search for
the next block groups,rather than turn off block reservation completely.

2) we don't need to check the window size if the block reservation is off.

The problem was originally found and fixed in ext4.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/balloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 92fd0338a6e..f5b57a2ca35 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1547,6 +1547,7 @@ retry_alloc:
 	 * turn off reservation for this allocation
 	 */
 	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
 		&& (rsv_is_empty(&my_rsv->rsv_window)))
 		my_rsv = NULL;
 
@@ -1585,7 +1586,7 @@ retry_alloc:
 		 * free blocks is less than half of the reservation
 		 * window size.
 		 */
-		if (free_blocks <= (windowsz/2))
+		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
 		brelse(bitmap_bh);
-- 
cgit v1.2.3


From 0e4fb5e283870757024294bc4567a7c59d936f0b Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:57 -0700
Subject: ext3: add an option to control error handling on file data

If the journal doesn't abort when it gets an IO error in file data blocks,
the file data corruption will spread silently.  Because most of
applications and commands do buffered writes without fsync(), they don't
notice the IO error.  It's scary for mission critical systems.  On the
other hand, if the journal aborts whenever it gets an IO error in file
data blocks, the system will easily become inoperable.  So this patch
introduces a filesystem option to determine whether it aborts the journal
or just call printk() when it gets an IO error in file data.

If you mount a ext3 fs with data_err=abort option, it aborts on file data
write error.  If you mount it with data_err=ignore, it doesn't abort, just
call printk().  data_err=ignore is the default.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 16 ++++++++++++++++
 fs/jbd/commit.c |  2 ++
 2 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 399a96a6c55..3a260af5544 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -754,6 +757,7 @@ enum {
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static const match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JFS_BARRIER;
 	else
 		journal->j_flags &= ~JFS_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index d6a6659f3e4..25719d902c5 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal)
 		printk(KERN_WARNING
 			"JBD: Detected IO errors while flushing file data "
 			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
 		err = 0;
 	}
 
-- 
cgit v1.2.3


From 960a22ae60c8a723bd17da3b929fe0bcea6d007e Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:58 -0700
Subject: jbd: ordered data integrity fix

In ordered mode, if a file data buffer being dirtied exists in the
committing transaction, we write the buffer to the disk, move it from the
committing transaction to the running transaction, then dirty it.  But we
don't have to remove the buffer from the committing transaction when the
buffer couldn't be written out, otherwise it would miss the error and the
committing transaction would not abort.

This patch adds an error check before removing the buffer from the
committing transaction.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 0540ca27a44..d15cd6e7251 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int ret = 0;
 
 	if (is_handle_aborted(handle))
-		return 0;
+		return ret;
 
 	jh = journal_add_journal_head(bh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 				   time if it is redirtied */
 			}
 
-			/* journal_clean_data_list() may have got there first */
+			/*
+			 * We cannot remove the buffer with io error from the
+			 * committing transaction, because otherwise it would
+			 * miss the error and the commit would not abort.
+			 */
+			if (unlikely(!buffer_uptodate(bh))) {
+				ret = -EIO;
+				goto no_journal;
+			}
+
 			if (jh->b_transaction != NULL) {
 				JBUFFER_TRACE(jh, "unfile from commit");
 				__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
 	}
 	JBUFFER_TRACE(jh, "exit");
 	journal_put_journal_head(jh);
-	return 0;
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3


From 6a897cf447a83c9c3fd1b85a1e525c02d6eada7d Mon Sep 17 00:00:00 2001
From: Eugene Dashevsky <eugene@ibrix.com>
Date: Sat, 18 Oct 2008 20:27:59 -0700
Subject: ext3: fix ext3_dx_readdir hash collision handling

This fixes a bug where readdir() would return a directory entry twice
if there was a hash collision in an hash tree indexed directory.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Eugene Dashevsky <eugene@ibrix.com>
Signed-off-by: Mike Snitzer <msnitzer@ibrix.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/dir.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 2eea96ec78e..28b681ef47e 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -410,7 +410,7 @@ static int call_filldir(struct file * filp, void * dirent,
 				get_dtype(sb, fname->file_type));
 		if (error) {
 			filp->f_pos = curr_pos;
-			info->extra_fname = fname->next;
+			info->extra_fname = fname;
 			return error;
 		}
 		fname = fname->next;
@@ -449,11 +449,21 @@ static int ext3_dx_readdir(struct file * filp,
 	 * If there are any leftover names on the hash collision
 	 * chain, return them first.
 	 */
-	if (info->extra_fname &&
-	    call_filldir(filp, dirent, filldir, info->extra_fname))
-		goto finished;
+	if (info->extra_fname) {
+		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+			goto finished;
 
-	if (!info->curr_node)
+		info->extra_fname = NULL;
+		info->curr_node = rb_next(info->curr_node);
+		if (!info->curr_node) {
+			if (info->next_hash == ~0) {
+				filp->f_pos = EXT3_HTREE_EOF;
+				goto finished;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
 	while (1) {
-- 
cgit v1.2.3


From 5ec8b75e3a2a94860ee99b5456fe1a963c8680e5 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 18 Oct 2008 20:28:00 -0700
Subject: ext3: truncate block allocated on a failed ext3_write_begin

For blocksize < pagesize we need to remove blocks that got allocated in
block_write_begin() if we fail with ENOSPC for later blocks.
block_write_begin() internally does this if it allocated page locally.
This makes sure we don't have blocks outside inode.i_size during ENOSPC.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/inode.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ebfec4d0148..f8424ad8997 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1186,6 +1186,13 @@ write_begin_failed:
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-- 
cgit v1.2.3


From cdbf6dba28e8e6268c8420857696309470009fd9 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 18 Oct 2008 20:28:00 -0700
Subject: ext3: avoid printk floods in the face of directory corruption

A very large directory with many read failures (either due to storage
problems, or due to invalid size & blocks from corruption) will generate a
printk storm as the filesystem continues to try to read all the blocks.
This flood of messages can tie up the box until it is complete - which may
be a very long time, especially for very large corrupted values.

This is fixed by only reporting the corruption once each time we try to
read the directory.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eugene Teo <eugeneteo@kernel.sg>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/dir.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 28b681ef47e..4c82531ea0a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp,
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext3_error (sb, "ext3_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext3_error(sb, __func__, "directory #%lu "
+					"contains a hole at offset %lld",
+					inode->i_ino, filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
-- 
cgit v1.2.3


From 6e7152944426be786c6c232990914e4565290d35 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Sat, 18 Oct 2008 20:28:01 -0700
Subject: hfsplus: missing O_LARGEFILE check

hfsplus: O_LARGEFILE checking is missing

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=8490

From: Alan Cox <alan@redhat.com>
Reported-by: didier <did447@gmail.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b085d64a2b6..963be644297 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
 		inode = HFSPLUS_I(inode).rsrc_inode;
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EOVERFLOW;
 	atomic_inc(&HFSPLUS_I(inode).opencnt);
 	return 0;
 }
-- 
cgit v1.2.3


From 248736c2a57206388c86f8cdd3392ee986e84f9f Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sat, 18 Oct 2008 20:28:02 -0700
Subject: hfsplus: fix possible deadlock when handling corrupted extents

A corrupted extent for the extent file itself may try to get an impossible
extent, causing a deadlock if I see it correctly.

Check the inode number after the first_blocks checks and fail if it's the
extent file, as according to the spec the extent file should have no
extent for itself.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/extents.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fec8f61227f..0022eec63cd 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		goto done;
 	}
 
+	if (inode->i_ino == HFSPLUS_EXT_CNID)
+		return -EIO;
+
 	mutex_lock(&HFSPLUS_I(inode).extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
-- 
cgit v1.2.3


From 85dd030edb174376ef43bc95e5fae4755af1ec98 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:18 -0700
Subject: seq_file: don't call bitmap_scnprintf_len()

"m->count + len < m->size" is true commonly, so bitmap_scnprintf()
is commonly called. this fix saves a call to bitmap_scnprintf_len().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index bd20f7f5a93..11c85fec6b4 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -452,17 +452,18 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 
 int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
 {
-	size_t len = bitmap_scnprintf_len(nr_bits);
-
-	if (m->count + len < m->size) {
-		bitmap_scnprintf(m->buf + m->count, m->size - m->count,
-				 bits, nr_bits);
-		m->count += len;
-		return 0;
+	if (m->count < m->size) {
+		int len = bitmap_scnprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
 	}
 	m->count = m->size;
 	return -1;
 }
+EXPORT_SYMBOL(seq_bitmap);
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
-- 
cgit v1.2.3


From 3eda20118000941e7e8994fc5fac8706d8c10f00 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:19 -0700
Subject: seq_file: add seq_cpumask_list(), seq_nodemask_list()

seq_cpumask_list(), seq_nodemask_list() are very like seq_cpumask(),
seq_nodemask(), but they print human readable string.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 11c85fec6b4..eba2eabcd2b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -465,6 +465,22 @@ int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
 }
 EXPORT_SYMBOL(seq_bitmap);
 
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits)
+{
+	if (m->count < m->size) {
+		int len = bitmap_scnlistprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_bitmap_list);
+
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
-- 
cgit v1.2.3


From 6409324b385f3f63a03645b4422e3be67348d922 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 18 Oct 2008 20:28:22 -0700
Subject: coredump: format_corename: don't append .%pid if multi-threaded

If the coredumping is multi-threaded, format_corename() appends .%pid to
the corename.  This was needed before the proper multi-thread core dump
support, now all the threads in the mm go into a single unified core file.

Remove this special case, it is not even documented and we have "%p"
and core_uses_pid.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: La Monte Yarroll <piggy@laurelnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index a41e7902ed0..4e834f16d9d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1386,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt);
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(char *corename, int nr_threads, long signr)
+static int format_corename(char *corename, long signr)
 {
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
@@ -1493,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr)
 	 * If core_pattern does not include a %p (as is the default)
 	 * and core_uses_pid is set, then .%pid will be appended to
 	 * the filename. Do not do this for piped commands. */
-	if (!ispipe && !pid_in_pattern
-	    && (core_uses_pid || nr_threads)) {
+	if (!ispipe && !pid_in_pattern && core_uses_pid) {
 		rc = snprintf(out_ptr, out_end - out_ptr,
 			      ".%d", task_tgid_vnr(current));
 		if (rc > out_end - out_ptr)
@@ -1757,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 * uses lock_kernel()
 	 */
  	lock_kernel();
-	ispipe = format_corename(corename, retval, signr);
+	ispipe = format_corename(corename, signr);
 	unlock_kernel();
 	/*
 	 * Don't bother to check the RLIMIT_CORE value if core_pattern points
-- 
cgit v1.2.3


From 656eb2cd5da153762f2e8419ca117ce12ef522c3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sat, 18 Oct 2008 20:28:23 -0700
Subject: add CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS

This adds a kconfig option to change the /proc/PID/coredump_filter default.
Fedora has been carrying a trivial patch to change the hard-wired value for
this default, since Fedora 8.  The default default can't change safely
because there are old GDB versions out there (all before 6.7) that are
confused by the core dump files created by the MMF_DUMP_ELF_HEADERS setting.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig.binfmt | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 801db134181..ce9fb3fbfae 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+	bool "Write ELF core dumps with partial segments"
+	default n
+	depends on BINFMT_ELF
+	help
+	  ELF core dump files describe each memory mapping of the crashed
+	  process, and can contain or omit the memory contents of each one.
+	  The contents of an unmodified text mapping are omitted by default.
+
+	  For an unmodified text mapping of an ELF object, including just
+	  the first page of the file in a core dump makes it possible to
+	  identify the build ID bits in the file, without paying the i/o
+	  cost and disk space to dump all the text.  However, versions of
+	  GDB before 6.7 are confused by ELF core dump files in this format.
+
+	  The core dump behavior can be controlled per process using
+	  the /proc/PID/coredump_filter pseudo-file; this setting is
+	  inherited.  See Documentation/filesystems/proc.txt for details.
+
+	  This config option changes the default setting of coredump_filter
+	  seen at boot time.  If unsure, say N.
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
 	depends on !MMU && (!FRV || BROKEN)
-- 
cgit v1.2.3


From 57cac4d1880527e0baf6c2fda529d2ad1d815aec Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Sat, 18 Oct 2008 20:28:25 -0700
Subject: kdump: make elfcorehdr_addr independent of CONFIG_PROC_VMCORE

o elfcorehdr_addr is used by not only the code under CONFIG_PROC_VMCORE
  but also by the code which is not inside CONFIG_PROC_VMCORE.  For
  example, is_kdump_kernel() is used by powerpc code to determine if
  kernel is booting after a panic then use previous kernel's TCE table.
  So even if CONFIG_PROC_VMCORE is not set in second kernel, one should be
  able to correctly determine that we are booting after a panic and setup
  calgary iommu accordingly.

o So remove the assumption that elfcorehdr_addr is under
  CONFIG_PROC_VMCORE.

o Move definition of elfcorehdr_addr to arch dependent crash files.
  (Unfortunately crash dump does not have an arch independent file
  otherwise that would have been the best place).

o kexec.c is not the right place as one can Have CRASH_DUMP enabled in
  second kernel without KEXEC being enabled.

o I don't see sh setup code parsing the command line for
  elfcorehdr_addr.  I am wondering how does vmcore interface work on sh.
  Anyway, I am atleast defining elfcoredhr_addr so that compilation is not
  broken on sh.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Simon Horman <horms@verge.net.au>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 841368b87a2..4c65ca432d3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,9 +32,6 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
-- 
cgit v1.2.3


From 85a0ee342e0c06c19d78fdf48307211c6cf18fcb Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sat, 18 Oct 2008 20:28:29 -0700
Subject: kdump: add is_vmcore_usable() and vmcore_unusable()

The usage of elfcorehdr_addr has changed recently such that being set to
ELFCORE_ADDR_MAX is used by is_kdump_kernel() to indicate if the code is
executing in a kernel executed as a crash kernel.

However, arch/ia64/kernel/setup.c:reserve_elfcorehdr will rest
elfcorehdr_addr to ELFCORE_ADDR_MAX on error, which means any subsequent
calls to is_kdump_kernel() will return 0, even though they should return
1.

Ok, at this point in time there are no subsequent calls, but I think its
fair to say that there is ample scope for error or at the very least
confusion.

This patch add an extra state, ELFCORE_ADDR_ERR, which indicates that
elfcorehdr_addr was passed on the command line, and thus execution is
taking place in a crashdump kernel, but vmcore can't be used for some
reason.  This is tested for using is_vmcore_usable() and set using
vmcore_unusable().  A subsequent patch makes use of this new code.

To summarise, the states that elfcorehdr_addr can now be in are as follows:

ELFCORE_ADDR_MAX: not a crashdump kernel
ELFCORE_ADDR_ERR: crashdump kernel but vmcore is unusable
any other value:  crash dump kernel and vmcore is usable

Signed-off-by: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4c65ca432d3..cd9ca67f841 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -644,7 +644,7 @@ static int __init vmcore_init(void)
 	int rc = 0;
 
 	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
-	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+	if (!(is_vmcore_usable()))
 		return rc;
 	rc = parse_crash_elf_headers();
 	if (rc) {
-- 
cgit v1.2.3


From bb26b963d8343bb1bde842fba0b6e00cad841f31 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 18 Oct 2008 20:28:49 -0700
Subject: fs/Kconfig: move CIFS out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig      | 143 +-------------------------------------------------------
 fs/cifs/Kconfig | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 142 deletions(-)
 create mode 100644 fs/cifs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d0a1174fb51..c189089f35a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1913,148 +1913,7 @@ config SMB_NLS_REMOTE
 
 	  smbmount from samba 2.2.0 or later supports this.
 
-config CIFS
-	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
-	depends on INET
-	select NLS
-	help
-	  This is the client VFS module for the Common Internet File System
-	  (CIFS) protocol which is the successor to the Server Message Block 
-	  (SMB) protocol, the native file sharing mechanism for most early
-	  PC operating systems.  The CIFS protocol is fully supported by 
-	  file servers such as Windows 2000 (including Windows 2003, NT 4  
-	  and Windows XP) as well by Samba (which provides excellent CIFS
-	  server support for Linux and many other operating systems). Limited
-	  support for OS/2 and Windows ME and similar servers is provided as
-	  well.
-
-	  The cifs module provides an advanced network file system
-	  client for mounting to CIFS compliant servers.  It includes
-	  support for DFS (hierarchical name space), secure per-user
-	  session establishment via Kerberos or NTLM or NTLMv2,
-	  safe distributed caching (oplock), optional packet
-	  signing, Unicode and other internationalization improvements.
-	  If you need to mount to Samba or Windows from this machine, say Y.
-
-config CIFS_STATS
-        bool "CIFS statistics"
-        depends on CIFS
-        help
-          Enabling this option will cause statistics for each server share
-	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
-
-config CIFS_STATS2
-	bool "Extended statistics"
-	depends on CIFS_STATS
-	help
-	  Enabling this option will allow more detailed statistics on SMB
-	  request timing to be displayed in /proc/fs/cifs/DebugData and also
-	  allow optional logging of slow responses to dmesg (depending on the
-	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
-	  These additional statistics may have a minor effect on performance
-	  and memory utilization.
-
-	  Unless you are a developer or are doing network performance analysis
-	  or tuning, say N.
-
-config CIFS_WEAK_PW_HASH
-	bool "Support legacy servers which use weaker LANMAN security"
-	depends on CIFS
-	help
-	  Modern CIFS servers including Samba and most Windows versions
-	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
-	  security mechanisms. These hash the password more securely
-	  than the mechanisms used in the older LANMAN version of the
-	  SMB protocol but LANMAN based authentication is needed to
-	  establish sessions with some old SMB servers.
-
-	  Enabling this option allows the cifs module to mount to older
-	  LANMAN based servers such as OS/2 and Windows 95, but such
-	  mounts may be less secure than mounts using NTLM or more recent
-	  security mechanisms if you are on a public network.  Unless you
-	  have a need to access old SMB servers (and are on a private
-	  network) you probably want to say N.  Even if this support
-	  is enabled in the kernel build, LANMAN authentication will not be
-	  used automatically. At runtime LANMAN mounts are disabled but
-	  can be set to required (or optional) either in
-	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
-	  option on the mount command. This support is disabled by
-	  default in order to reduce the possibility of a downgrade
-	  attack.
-
-	  If unsure, say N.
-
-config CIFS_UPCALL
-	  bool "Kerberos/SPNEGO advanced session setup"
-	  depends on CIFS && KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which accesses
-	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	    Kerberos tickets which are needed to mount to certain secure servers
-	    (for which more secure Kerberos authentication is required). If
-	    unsure, say N.
-
-config CIFS_XATTR
-        bool "CIFS extended attributes"
-        depends on CIFS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).  CIFS maps the name of
-          extended attributes beginning with the user namespace prefix
-          to SMB/CIFS EAs. EAs are stored on Windows servers without the
-          user namespace prefix, but their names are seen by Linux cifs clients
-          prefaced by the user namespace prefix. The system namespace
-          (used by some filesystems to store ACLs) is not supported at
-          this time.
-
-          If unsure, say N.
-
-config CIFS_POSIX
-        bool "CIFS POSIX Extensions"
-        depends on CIFS_XATTR
-        help
-          Enabling this option will cause the cifs client to attempt to
-	  negotiate a newer dialect with servers, such as Samba 3.0.5
-	  or later, that optionally can handle more POSIX like (rather
-	  than Windows like) file behavior.  It also enables
-	  support for POSIX ACLs (getfacl and setfacl) to servers
-	  (such as Samba 3.10 and later) which can negotiate
-	  CIFS POSIX ACL support.  If unsure, say N.
-
-config CIFS_DEBUG2
-	bool "Enable additional CIFS debugging routines"
-	depends on CIFS
-	help
-	   Enabling this option adds a few more debugging routines
-	   to the cifs code which slightly increases the size of
-	   the cifs module and can cause additional logging of debug
-	   messages in some error paths, slowing performance. This
-	   option can be turned off unless you are debugging
-	   cifs problems.  If unsure, say N.
-
-config CIFS_EXPERIMENTAL
-	  bool "CIFS Experimental Features (EXPERIMENTAL)"
-	  depends on CIFS && EXPERIMENTAL
-	  help
-	    Enables cifs features under testing. These features are
-	    experimental and currently include DFS support and directory 
-	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
-	    mechanism which will be used for Kerberos session negotiation
-	    and uid remapping.  Some of these features also may depend on 
-	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
-	    (which is disabled by default). See the file fs/cifs/README 
-	    for more details.  If unsure, say N.
-
-config CIFS_DFS_UPCALL
-	  bool "DFS feature support (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which contacts userspace
-	    helper utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say N.
+source "fs/cifs/Kconfig"
 
 config NCP_FS
 	tristate "NCP file system support (to mount NetWare volumes)"
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
new file mode 100644
index 00000000000..341a98965bd
--- /dev/null
+++ b/fs/cifs/Kconfig
@@ -0,0 +1,142 @@
+config CIFS
+	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+	depends on INET
+	select NLS
+	help
+	  This is the client VFS module for the Common Internet File System
+	  (CIFS) protocol which is the successor to the Server Message Block
+	  (SMB) protocol, the native file sharing mechanism for most early
+	  PC operating systems.  The CIFS protocol is fully supported by
+	  file servers such as Windows 2000 (including Windows 2003, NT 4
+	  and Windows XP) as well by Samba (which provides excellent CIFS
+	  server support for Linux and many other operating systems). Limited
+	  support for OS/2 and Windows ME and similar servers is provided as
+	  well.
+
+	  The cifs module provides an advanced network file system
+	  client for mounting to CIFS compliant servers.  It includes
+	  support for DFS (hierarchical name space), secure per-user
+	  session establishment via Kerberos or NTLM or NTLMv2,
+	  safe distributed caching (oplock), optional packet
+	  signing, Unicode and other internationalization improvements.
+	  If you need to mount to Samba or Windows from this machine, say Y.
+
+config CIFS_STATS
+        bool "CIFS statistics"
+        depends on CIFS
+        help
+          Enabling this option will cause statistics for each server share
+	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
+
+config CIFS_STATS2
+	bool "Extended statistics"
+	depends on CIFS_STATS
+	help
+	  Enabling this option will allow more detailed statistics on SMB
+	  request timing to be displayed in /proc/fs/cifs/DebugData and also
+	  allow optional logging of slow responses to dmesg (depending on the
+	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
+	  These additional statistics may have a minor effect on performance
+	  and memory utilization.
+
+	  Unless you are a developer or are doing network performance analysis
+	  or tuning, say N.
+
+config CIFS_WEAK_PW_HASH
+	bool "Support legacy servers which use weaker LANMAN security"
+	depends on CIFS
+	help
+	  Modern CIFS servers including Samba and most Windows versions
+	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+	  security mechanisms. These hash the password more securely
+	  than the mechanisms used in the older LANMAN version of the
+	  SMB protocol but LANMAN based authentication is needed to
+	  establish sessions with some old SMB servers.
+
+	  Enabling this option allows the cifs module to mount to older
+	  LANMAN based servers such as OS/2 and Windows 95, but such
+	  mounts may be less secure than mounts using NTLM or more recent
+	  security mechanisms if you are on a public network.  Unless you
+	  have a need to access old SMB servers (and are on a private
+	  network) you probably want to say N.  Even if this support
+	  is enabled in the kernel build, LANMAN authentication will not be
+	  used automatically. At runtime LANMAN mounts are disabled but
+	  can be set to required (or optional) either in
+	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+	  option on the mount command. This support is disabled by
+	  default in order to reduce the possibility of a downgrade
+	  attack.
+
+	  If unsure, say N.
+
+config CIFS_UPCALL
+	  bool "Kerberos/SPNEGO advanced session setup"
+	  depends on CIFS && KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which accesses
+	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	    Kerberos tickets which are needed to mount to certain secure servers
+	    (for which more secure Kerberos authentication is required). If
+	    unsure, say N.
+
+config CIFS_XATTR
+        bool "CIFS extended attributes"
+        depends on CIFS
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).  CIFS maps the name of
+          extended attributes beginning with the user namespace prefix
+          to SMB/CIFS EAs. EAs are stored on Windows servers without the
+          user namespace prefix, but their names are seen by Linux cifs clients
+          prefaced by the user namespace prefix. The system namespace
+          (used by some filesystems to store ACLs) is not supported at
+          this time.
+
+          If unsure, say N.
+
+config CIFS_POSIX
+        bool "CIFS POSIX Extensions"
+        depends on CIFS_XATTR
+        help
+          Enabling this option will cause the cifs client to attempt to
+	  negotiate a newer dialect with servers, such as Samba 3.0.5
+	  or later, that optionally can handle more POSIX like (rather
+	  than Windows like) file behavior.  It also enables
+	  support for POSIX ACLs (getfacl and setfacl) to servers
+	  (such as Samba 3.10 and later) which can negotiate
+	  CIFS POSIX ACL support.  If unsure, say N.
+
+config CIFS_DEBUG2
+	bool "Enable additional CIFS debugging routines"
+	depends on CIFS
+	help
+	   Enabling this option adds a few more debugging routines
+	   to the cifs code which slightly increases the size of
+	   the cifs module and can cause additional logging of debug
+	   messages in some error paths, slowing performance. This
+	   option can be turned off unless you are debugging
+	   cifs problems.  If unsure, say N.
+
+config CIFS_EXPERIMENTAL
+	  bool "CIFS Experimental Features (EXPERIMENTAL)"
+	  depends on CIFS && EXPERIMENTAL
+	  help
+	    Enables cifs features under testing. These features are
+	    experimental and currently include DFS support and directory
+	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
+	    mechanism which will be used for Kerberos session negotiation
+	    and uid remapping.  Some of these features also may depend on
+	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
+	    (which is disabled by default). See the file fs/cifs/README
+	    for more details.  If unsure, say N.
+
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support (EXPERIMENTAL)"
+	  depends on CIFS_EXPERIMENTAL
+	  depends on KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which contacts userspace
+	    helper utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
-- 
cgit v1.2.3


From 413460980ea7796582bce672be85879be0865ada Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 20 Oct 2008 18:24:42 +0000
Subject: [CIFS] fix build error

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 232ab16d7fd..23cca214ede 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -813,7 +813,7 @@ cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
 					current->tgid);
 		/* although we would like to mark the file hidden
  		   if that fails we will still try to rename it */
-		if (rc != 0) {
+		if (rc != 0)
 			cifsInode->cifsAttrs = dosattr;
 		else
 			dosattr = origattr; /* since not able to change them */
-- 
cgit v1.2.3


From 6da0b38f4433fb0f24615449d7966471b6e5eae0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 20 Oct 2008 22:28:45 +0400
Subject: fs/Kconfig: move ext2, ext3, ext4, JBD, JBD2 out

Use fs/*/Kconfig more, which is good because everything related to one
filesystem is in one place and fs/Kconfig is quite fat.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig      | 272 ++------------------------------------------------------
 fs/ext2/Kconfig |  55 ++++++++++++
 fs/ext3/Kconfig |  67 ++++++++++++++
 fs/ext4/Kconfig |  79 ++++++++++++++++
 fs/jbd/Kconfig  |  30 +++++++
 fs/jbd2/Kconfig |  33 +++++++
 6 files changed, 269 insertions(+), 267 deletions(-)
 create mode 100644 fs/ext2/Kconfig
 create mode 100644 fs/ext3/Kconfig
 create mode 100644 fs/ext4/Kconfig
 create mode 100644 fs/jbd/Kconfig
 create mode 100644 fs/jbd2/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 4eca61c201f..e282002b94d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,61 +6,9 @@ menu "File systems"
 
 if BLOCK
 
-config EXT2_FS
-	tristate "Second extended fs support"
-	help
-	  Ext2 is a standard Linux file system for hard disks.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext2.
-
-	  If unsure, say Y.
-
-config EXT2_FS_XATTR
-	bool "Ext2 extended attributes"
-	depends on EXT2_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config EXT2_FS_POSIX_ACL
-	bool "Ext2 POSIX Access Control Lists"
-	depends on EXT2_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT2_FS_SECURITY
-	bool "Ext2 Security Labels"
-	depends on EXT2_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext2 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-	bool "Ext2 execute in place support"
-	depends on EXT2_FS && MMU
-	help
-	  Execute in place can be used on memory-backed block devices. If you
-	  enable this option, you can select to mount block devices which are
-	  capable of this feature without using the page cache.
-
-	  If you do not use a block device that is capable of using this,
-	  or if unsure, say N.
+source "fs/ext2/Kconfig"
+source "fs/ext3/Kconfig"
+source "fs/ext4/Kconfig"
 
 config FS_XIP
 # execute in place
@@ -68,218 +16,8 @@ config FS_XIP
 	depends on EXT2_FS_XIP
 	default y
 
-config EXT3_FS
-	tristate "Ext3 journalling file system support"
-	select JBD
-	help
-	  This is the journalling version of the Second extended file system
-	  (often called ext3), the de facto standard Linux file system
-	  (method to organize files on a storage device) for hard disks.
-
-	  The journalling code included in this driver means you do not have
-	  to run e2fsck (file system checker) on your file systems after a
-	  crash.  The journal keeps track of any changes that were being made
-	  at the time the system crashed, and can ensure that your file system
-	  is consistent without the need for a lengthy check.
-
-	  Other than adding the journal to the file system, the on-disk format
-	  of ext3 is identical to ext2.  It is possible to freely switch
-	  between using the ext3 driver and the ext2 driver, as long as the
-	  file system has been cleanly unmounted, or e2fsck is run on the file
-	  system.
-
-	  To add a journal on an existing ext2 file system or change the
-	  behavior of ext3 file systems, you can use the tune2fs utility ("man
-	  tune2fs").  To modify attributes of files and directories on ext3
-	  file systems, use chattr ("man chattr").  You need to be using
-	  e2fsprogs version 1.20 or later in order to create ext3 journals
-	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext3.
-
-config EXT3_FS_XATTR
-	bool "Ext3 extended attributes"
-	depends on EXT3_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
-	bool "Ext3 POSIX Access Control Lists"
-	depends on EXT3_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
-	bool "Ext3 Security Labels"
-	depends on EXT3_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext3 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config EXT4_FS
-	tristate "The Extended 4 (ext4) filesystem"
-	select JBD2
-	select CRC16
-	help
-	  This is the next generation of the ext3 filesystem.
-
-	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4 is not forwards compatible with
-	  ext3; it is based on extent maps and it supports 48-bit
-	  physical block numbers.  The ext4 filesystem also supports delayed
-	  allocation, persistent preallocation, high resolution time stamps,
-	  and a number of other features to improve performance and speed
-	  up fsck time.  For more information, please see the web pages at
-	  http://ext4.wiki.kernel.org.
-
-	  The ext4 filesystem will support mounting an ext3
-	  filesystem; while there will be some performance gains from
-	  the delayed allocation and inode table readahead, the best
-	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
-	  filesystem initially.
-
-	  To compile this file system support as a module, choose M here. The
-	  module will be called ext4.
-
-	  If unsure, say N.
-
-config EXT4DEV_COMPAT
-	bool "Enable ext4dev compatibility"
-	depends on EXT4_FS
-	help
-	  Starting with 2.6.28, the name of the ext4 filesystem was
-	  renamed from ext4dev to ext4.  Unfortunately there are some
-	  legacy userspace programs (such as klibc's fstype) have
-	  "ext4dev" hardcoded.
-
-	  To enable backwards compatibility so that systems that are
-	  still expecting to mount ext4 filesystems using ext4dev,
-	  chose Y here.   This feature will go away by 2.6.31, so
-	  please arrange to get your userspace programs fixed!
-
-config EXT4_FS_XATTR
-	bool "Ext4 extended attributes"
-	depends on EXT4_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext4.
-
-config EXT4_FS_POSIX_ACL
-	bool "Ext4 POSIX Access Control Lists"
-	depends on EXT4_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the POSIX ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT4_FS_SECURITY
-	bool "Ext4 Security Labels"
-	depends on EXT4_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext4 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JBD
-	tristate
-	help
-	  This is a generic journalling layer for block devices.  It is
-	  currently used by the ext3 file system, but it could also be
-	  used to add journal support to other file systems or block
-	  devices such as RAID or LVM.
-
-	  If you are using the ext3 file system, you need to say Y here.
-	  If you are not using ext3 then you will probably want to say N.
-
-	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 into the kernel, you
-	  cannot compile this code as a module.
-
-config JBD_DEBUG
-	bool "JBD (ext3) debugging support"
-	depends on JBD && DEBUG_FS
-	help
-	  If you are using the ext3 journaled file system (or potentially any
-	  other file system/device using JBD), this option allows you to
-	  enable debugging output while the system is running, in order to
-	  help track down any problems you are having.  By default the
-	  debugging output will be turned off.
-
-	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
-	  number between 1 and 5, the higher the number, the more debugging
-	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
-
-config JBD2
-	tristate
-	select CRC32
-	help
-	  This is a generic journaling layer for block devices that support
-	  both 32-bit and 64-bit block numbers.  It is currently used by
-	  the ext4 and OCFS2 filesystems, but it could also be used to add
-	  journal support to other file systems or block devices such
-	  as RAID or LVM.
-
-	  If you are using ext4 or OCFS2, you need to say Y here.
-	  If you are not using ext4 or OCFS2 then you will
-	  probably want to say N.
-
-	  To compile this device as a module, choose M here. The module will be
-	  called jbd2.  If you are compiling ext4 or OCFS2 into the kernel,
-	  you cannot compile this code as a module.
-
-config JBD2_DEBUG
-	bool "JBD2 (ext4) debugging support"
-	depends on JBD2 && DEBUG_FS
-	help
-	  If you are using the ext4 journaled file system (or
-	  potentially any other filesystem/device using JBD2), this option
-	  allows you to enable debugging output while the system is running,
-	  in order to help track down any problems you are having.
-	  By default, the debugging output will be turned off.
-
-	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
-	  number between 1 and 5. The higher the number, the more debugging
-	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+source "fs/jbd/Kconfig"
+source "fs/jbd2/Kconfig"
 
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
new file mode 100644
index 00000000000..14a6780fd03
--- /dev/null
+++ b/fs/ext2/Kconfig
@@ -0,0 +1,55 @@
+config EXT2_FS
+	tristate "Second extended fs support"
+	help
+	  Ext2 is a standard Linux file system for hard disks.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext2.
+
+	  If unsure, say Y.
+
+config EXT2_FS_XATTR
+	bool "Ext2 extended attributes"
+	depends on EXT2_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EXT2_FS_POSIX_ACL
+	bool "Ext2 POSIX Access Control Lists"
+	depends on EXT2_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT2_FS_SECURITY
+	bool "Ext2 Security Labels"
+	depends on EXT2_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config EXT2_FS_XIP
+	bool "Ext2 execute in place support"
+	depends on EXT2_FS && MMU
+	help
+	  Execute in place can be used on memory-backed block devices. If you
+	  enable this option, you can select to mount block devices which are
+	  capable of this feature without using the page cache.
+
+	  If you do not use a block device that is capable of using this,
+	  or if unsure, say N.
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
new file mode 100644
index 00000000000..8e0cfe44b0f
--- /dev/null
+++ b/fs/ext3/Kconfig
@@ -0,0 +1,67 @@
+config EXT3_FS
+	tristate "Ext3 journalling file system support"
+	select JBD
+	help
+	  This is the journalling version of the Second extended file system
+	  (often called ext3), the de facto standard Linux file system
+	  (method to organize files on a storage device) for hard disks.
+
+	  The journalling code included in this driver means you do not have
+	  to run e2fsck (file system checker) on your file systems after a
+	  crash.  The journal keeps track of any changes that were being made
+	  at the time the system crashed, and can ensure that your file system
+	  is consistent without the need for a lengthy check.
+
+	  Other than adding the journal to the file system, the on-disk format
+	  of ext3 is identical to ext2.  It is possible to freely switch
+	  between using the ext3 driver and the ext2 driver, as long as the
+	  file system has been cleanly unmounted, or e2fsck is run on the file
+	  system.
+
+	  To add a journal on an existing ext2 file system or change the
+	  behavior of ext3 file systems, you can use the tune2fs utility ("man
+	  tune2fs").  To modify attributes of files and directories on ext3
+	  file systems, use chattr ("man chattr").  You need to be using
+	  e2fsprogs version 1.20 or later in order to create ext3 journals
+	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext3.
+
+config EXT3_FS_XATTR
+	bool "Ext3 extended attributes"
+	depends on EXT3_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+	bool "Ext3 POSIX Access Control Lists"
+	depends on EXT3_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT3_FS_SECURITY
+	bool "Ext3 Security Labels"
+	depends on EXT3_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext3 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 00000000000..7505482a08f
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,79 @@
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
+	select JBD2
+	select CRC16
+	help
+	  This is the next generation of the ext3 filesystem.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers.  The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time.  For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4.
+
+	  If unsure, say N.
+
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4.  Unfortunately there are some
+	  legacy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here.   This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 00000000000..4e28beeed15
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+	tristate
+	help
+	  This is a generic journalling layer for block devices.  It is
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
+
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here: the module will be
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
+
+config JBD_DEBUG
+	bool "JBD (ext3) debugging support"
+	depends on JBD && DEBUG_FS
+	help
+	  If you are using the ext3 journaled file system (or potentially any
+	  other file system/device using JBD), this option allows you to
+	  enable debugging output while the system is running, in order to
+	  help track down any problems you are having.  By default the
+	  debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+	  number between 1 and 5, the higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
new file mode 100644
index 00000000000..f32f346f4b0
--- /dev/null
+++ b/fs/jbd2/Kconfig
@@ -0,0 +1,33 @@
+config JBD2
+	tristate
+	select CRC32
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers.  It is currently used by
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2.  If you are compiling ext4 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4) debugging support"
+	depends on JBD2 && DEBUG_FS
+	help
+	  If you are using the ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  number between 1 and 5. The higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
-- 
cgit v1.2.3


From 14121bdccc17b8c0e4368a9c0e4f82c3dd47f240 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Oct 2008 14:45:22 -0400
Subject: cifs: make cifs_rename handle -EACCES errors

cifs: make cifs_rename handle -EACCES errors

Some servers seem to return -EACCES when attempting to rename one
open file on top of another. Refactor the cifs_rename logic to
attempt to rename the target file out of the way in this situation.

This also fixes the "unlink_target" logic to be undoable if the
subsequent rename fails.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 174 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 122 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 23cca214ede..8ac4eabc1f8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,22 +1285,24 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 	return rc;
 }
 
-int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
-	struct inode *target_inode, struct dentry *target_direntry)
+int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
+	struct inode *target_dir, struct dentry *target_dentry)
 {
 	char *fromName = NULL;
 	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
+	struct cifsInodeInfo *target_cinode;
 	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
 	FILE_UNIX_BASIC_INFO *info_buf_target;
-	int xid;
-	int rc;
+	__u16 dstfid;
+	int xid, rc, tmprc, oplock = 0;
+	bool delete_already_pending;
 
-	cifs_sb_target = CIFS_SB(target_inode->i_sb);
-	cifs_sb_source = CIFS_SB(source_inode->i_sb);
-	pTcon = cifs_sb_source->tcon;
+	cifs_sb_target = CIFS_SB(target_dir->i_sb);
+	cifs_sb_source = CIFS_SB(source_dir->i_sb);
+	tcon = cifs_sb_source->tcon;
 
 	xid = GetXid();
 
@@ -1308,7 +1310,7 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 	 * BB: this might be allowed if same server, but different share.
 	 * Consider adding support for this
 	 */
-	if (pTcon != cifs_sb_target->tcon) {
+	if (tcon != cifs_sb_target->tcon) {
 		rc = -EXDEV;
 		goto cifs_rename_exit;
 	}
@@ -1317,65 +1319,133 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 	 * we already have the rename sem so we do not need to
 	 * grab it again here to protect the path integrity
 	 */
-	fromName = build_path_from_dentry(source_direntry);
+	fromName = build_path_from_dentry(source_dentry);
 	if (fromName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	toName = build_path_from_dentry(target_direntry);
+	toName = build_path_from_dentry(target_dentry);
 	if (toName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	rc = cifs_do_rename(xid, source_direntry, fromName,
-			    target_direntry, toName);
+	rc = cifs_do_rename(xid, source_dentry, fromName,
+			    target_dentry, toName);
 
-	if (rc == -EEXIST) {
-		if (pTcon->unix_ext) {
-			/*
-			 * Are src and dst hardlinks of same inode? We can
-			 * only tell with unix extensions enabled
-			 */
-			info_buf_source =
-				kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
-						GFP_KERNEL);
-			if (info_buf_source == NULL)
-				goto unlink_target;
-
-			info_buf_target = info_buf_source + 1;
-			rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
-						info_buf_source,
-						cifs_sb_source->local_nls,
-						cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (rc != 0)
-				goto unlink_target;
-
-			rc = CIFSSMBUnixQPathInfo(xid, pTcon,
-						toName, info_buf_target,
-						cifs_sb_target->local_nls,
-						/* remap based on source sb */
-						cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc == -EEXIST && tcon->unix_ext) {
+		/*
+		 * Are src and dst hardlinks of same inode? We can
+		 * only tell with unix extensions enabled
+		 */
+		info_buf_source =
+			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+					GFP_KERNEL);
+		if (info_buf_source == NULL) {
+			rc = -ENOMEM;
+			goto cifs_rename_exit;
+		}
+
+		info_buf_target = info_buf_source + 1;
+		rc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
+					info_buf_source,
+					cifs_sb_source->local_nls,
+					cifs_sb_source->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc != 0)
+			goto unlink_target;
 
-			if (rc == 0 && (info_buf_source->UniqueId ==
-					info_buf_target->UniqueId))
-				/* same file, POSIX says that this is a noop */
-				goto cifs_rename_exit;
-		} /* else ... BB we could add the same check for Windows by
+		rc = CIFSSMBUnixQPathInfo(xid, tcon,
+					toName, info_buf_target,
+					cifs_sb_target->local_nls,
+					/* remap based on source sb */
+					cifs_sb_source->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		if (rc == 0 && (info_buf_source->UniqueId ==
+				info_buf_target->UniqueId))
+			/* same file, POSIX says that this is a noop */
+			goto cifs_rename_exit;
+
+		rc = -EEXIST;
+	} /* else ... BB we could add the same check for Windows by
 		     checking the UniqueId via FILE_INTERNAL_INFO */
+
+	if ((rc == -EACCES) || (rc == -EEXIST)) {
 unlink_target:
+		/* don't bother if this is a negative dentry */
+		if (!target_dentry->d_inode)
+			goto cifs_rename_exit;
+
+		target_cinode = CIFS_I(target_dentry->d_inode);
+
+		/* try to move the target out of the way */
+		tmprc = CIFSSMBOpen(xid, tcon, toName, FILE_OPEN, DELETE,
+				    CREATE_NOT_DIR, &dstfid, &oplock, NULL,
+				    cifs_sb_target->local_nls,
+				    cifs_sb_target->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (tmprc)
+			goto cifs_rename_exit;
+
+		/* rename the file to random name */
+		tmprc = CIFSSMBRenameOpenFile(xid, tcon, dstfid, NULL,
+					      cifs_sb_target->local_nls,
+					      cifs_sb_target->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		if (tmprc)
+			goto close_target;
+
+		delete_already_pending = target_cinode->delete_pending;
+
+		if (!delete_already_pending) {
+			/* set delete on close */
+			tmprc = CIFSSMBSetFileDisposition(xid, tcon,
+							  true, dstfid,
+							  current->tgid);
+			/*
+			 * This hack is for broken samba servers, remove this
+			 * once more fixed ones are in the field.
+			 */
+			if (tmprc == -ENOENT)
+				delete_already_pending = false;
+			else if (tmprc)
+				goto undo_target_rename;
+
+			target_cinode->delete_pending = true;
+		}
+
+
+		rc = cifs_do_rename(xid, source_dentry, fromName,
+				    target_dentry, toName);
+
+		if (rc == 0)
+			goto close_target;
+
 		/*
-		 * we either can not tell the files are hardlinked (as with
-		 * Windows servers) or files are not hardlinked. Delete the
-		 * target manually before renaming to follow POSIX rather than
-		 * Windows semantics
+		 * after this point, we can't bother with error handling on
+		 * the undo's. This is best effort since we can't do anything
+		 * about failures here.
 		 */
-		cifs_unlink(target_inode, target_direntry);
-		rc = cifs_do_rename(xid, source_direntry, fromName,
-				    target_direntry, toName);
+		if (!delete_already_pending) {
+			tmprc = CIFSSMBSetFileDisposition(xid, tcon,
+							  false, dstfid,
+							  current->tgid);
+			if (tmprc == 0)
+				target_cinode->delete_pending = false;
+		}
+
+undo_target_rename:
+		/* rename failed: undo target rename */
+		CIFSSMBRenameOpenFile(xid, tcon, dstfid,
+				      target_dentry->d_name.name,
+				      cifs_sb_target->local_nls,
+				      cifs_sb_target->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+close_target:
+		CIFSSMBClose(xid, tcon, dstfid);
 	}
 
 cifs_rename_exit:
-- 
cgit v1.2.3


From 2515ddc6db8eb49a79f0fe5e67ff09ac7c81eab4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 21 Oct 2008 12:07:40 +0900
Subject: binfmt_elf_fdpic: Update for cputime changes.

Commit f06febc96ba8e0af80bcc3eaec0a109e88275fac ("timers: fix itimer/
many thread hang") introduced a new task_cputime interface and
subsequently only converted binfmt_elf over to it.  This results in the
build for binfmt_elf_fdpic blowing up given that p->signal->{u,s}time
have disappeared from underneath us.

Apply the same trivial fix from binfmt_elf to binfmt_elf_fdpic.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf_fdpic.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 0e8367c5462..5b5424cb339 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1390,20 +1390,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
-- 
cgit v1.2.3


From aeb5d727062a0238a2f96c9c380fbd2be4640c6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 2 Sep 2008 15:28:45 -0400
Subject: [PATCH] introduce fmode_t, do annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c          | 10 +++++-----
 fs/fifo.c               |  6 +++---
 fs/file_table.c         |  4 ++--
 fs/hostfs/hostfs_kern.c |  5 +++--
 fs/locks.c              |  3 ++-
 fs/open.c               |  2 +-
 fs/proc/base.c          |  4 ++--
 fs/reiserfs/journal.c   |  2 +-
 8 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 218408eed1b..8897f3b02e9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -840,7 +840,7 @@ EXPORT_SYMBOL_GPL(bd_release_from_disk);
  * to be used for internal purposes.  If you ever need it - reconsider
  * your API.
  */
-struct block_device *open_by_devnum(dev_t dev, unsigned mode)
+struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
@@ -975,7 +975,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part);
 static int __blkdev_put(struct block_device *bdev, int for_part);
 
@@ -1104,7 +1104,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	return ret;
 }
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part)
 {
 	/*
@@ -1123,7 +1123,7 @@ static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
 	return do_open(bdev, &fake_file, for_part);
 }
 
-int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
+int blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags)
 {
 	return __blkdev_get(bdev, mode, flags, 0);
 }
@@ -1315,7 +1315,7 @@ EXPORT_SYMBOL(lookup_bdev);
 struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
 {
 	struct block_device *bdev;
-	mode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
 	bdev = lookup_bdev(path);
diff --git a/fs/fifo.c b/fs/fifo.c
index 987bf941149..f8f97b8b6d4 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -51,7 +51,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
 
 	switch (filp->f_mode) {
-	case 1:
+	case FMODE_READ:
 	/*
 	 *  O_RDONLY
 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
@@ -76,7 +76,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 2:
+	case FMODE_WRITE:
 	/*
 	 *  O_WRONLY
 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
@@ -98,7 +98,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 3:
+	case FMODE_READ | FMODE_WRITE:
 	/*
 	 *  O_RDWR
 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
diff --git a/fs/file_table.c b/fs/file_table.c
index f45a4493f9e..efc06faede6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL(get_empty_filp);
  * code should be moved into this function.
  */
 struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
-		mode_t mode, const struct file_operations *fop)
+		fmode_t mode, const struct file_operations *fop)
 {
 	struct file *file;
 	struct path;
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(alloc_file);
  * of this should be moving to alloc_file().
  */
 int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
-	   mode_t mode, const struct file_operations *fop)
+	   fmode_t mode, const struct file_operations *fop)
 {
 	int error = 0;
 	file->f_path.dentry = dentry;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d6ecabf4d23..7f34f4385de 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -20,7 +20,7 @@
 struct hostfs_inode_info {
 	char *host_filename;
 	int fd;
-	int mode;
+	fmode_t mode;
 	struct inode vfs_inode;
 };
 
@@ -373,7 +373,8 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 int hostfs_file_open(struct inode *ino, struct file *file)
 {
 	char *name;
-	int mode = 0, r = 0, w = 0, fd;
+	fmode_t mode = 0;
+	int r = 0, w = 0, fd;
 
 	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
 	if ((mode & HOSTFS_I(ino)->mode) == mode)
diff --git a/fs/locks.c b/fs/locks.c
index 5eb259e3cd3..20457486d6b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1580,7 +1580,8 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
 	cmd &= ~LOCK_NB;
 	unlock = (cmd == LOCK_UN);
 
-	if (!unlock && !(cmd & LOCK_MAND) && !(filp->f_mode & 3))
+	if (!unlock && !(cmd & LOCK_MAND) &&
+	    !(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
 	error = flock_make_lock(filp, &lock, cmd);
diff --git a/fs/open.c b/fs/open.c
index 5596049863b..83cdb9dee0c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -798,7 +798,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	int error;
 
 	f->f_flags = flags;
-	f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
+	f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b5918ae8ca7..486cf3fe713 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1712,9 +1712,9 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock;
-	if (file->f_mode & 1)
+	if (file->f_mode & FMODE_READ)
 		inode->i_mode |= S_IRUSR | S_IXUSR;
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c21df71943a..b89d193a00d 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2593,7 +2593,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	int blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
-- 
cgit v1.2.3


From 86d434dede14108dd917b25af0f29c0cb28b8d18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2007 19:50:05 -0400
Subject: [PATCH] eliminate use of ->f_flags in block methods

store needed information in f_mode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8897f3b02e9..b9022694e9f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1007,6 +1007,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		return ret;
 	}
 
+	if (file->f_flags & O_NDELAY)
+		file->f_mode |= FMODE_NDELAY;
+	if (file->f_flags & O_EXCL)
+		file->f_mode |= FMODE_EXCL;
+	if ((file->f_flags & O_ACCMODE) == 3)
+		file->f_mode |= FMODE_WRITE_IOCTL;
+
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
 
-- 
cgit v1.2.3


From 08f85851215100d0eebf026810955ee6ad456c38 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:26:20 -0400
Subject: [PATCH] move block_device_operations to blkdev.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/xip.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 4fb94c20041..b72b8588422 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -11,6 +11,7 @@
 #include <linux/buffer_head.h>
 #include <linux/ext2_fs_sb.h>
 #include <linux/ext2_fs.h>
+#include <linux/blkdev.h>
 #include "ext2.h"
 #include "xip.h"
 
-- 
cgit v1.2.3


From d4430d62fa77208824a37fe6f85ab2831d274769 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Mar 2008 09:09:22 -0500
Subject: [PATCH] beginning of methods conversion

To keep the size of changesets sane we split the switch by drivers;
to keep the damn thing bisectable we do the following:
	1) rename the affected methods, add ones with correct
prototypes, make (few) callers handle both.  That's this changeset.
	2) for each driver convert to new methods.  *ALL* drivers
are converted in this series.
	3) kill the old (renamed) methods.

Note that it _is_ a flagday; all in-tree drivers are converted and by the
end of this series no trace of old methods remain.  The only reason why
we do that this way is to keep the damn thing bisectable and allow per-driver
debugging if anything goes wrong.

New methods:
	open(bdev, mode)
	release(disk, mode)
	ioctl(bdev, mode, cmd, arg)		/* Called without BKL */
	compat_ioctl(bdev, mode, cmd, arg)
	locked_ioctl(bdev, mode, cmd, arg)	/* Called with BKL, legacy */

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b9022694e9f..73b6ce47c86 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,8 +1033,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
+			if (disk->fops->__open) {
+				ret = disk->fops->__open(bdev->bd_inode, file);
+				if (ret)
+					goto out_first;
+			}
 			if (disk->fops->open) {
-				ret = disk->fops->open(bdev->bd_inode, file);
+				ret = disk->fops->open(bdev, file->f_mode);
 				if (ret)
 					goto out_clear;
 			}
@@ -1074,8 +1079,13 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
+			if (bdev->bd_disk->fops->__open) {
+				ret = bdev->bd_disk->fops->__open(bdev->bd_inode, file);
+				if (ret)
+					goto out;
+			}
 			if (bdev->bd_disk->fops->open) {
-				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
+				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
 				if (ret)
 					goto out_unlock_bdev;
 			}
@@ -1184,8 +1194,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		kill_bdev(bdev);
 	}
 	if (bdev->bd_contains == bdev) {
+		if (disk->fops->__release)
+			ret = disk->fops->__release(bd_inode, NULL);
 		if (disk->fops->release)
-			ret = disk->fops->release(bd_inode, NULL);
+			ret = disk->fops->release(disk, 0);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
-- 
cgit v1.2.3


From 90b8f2824ce68dd87d304641a1d5a048dfff39f5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 2 Mar 2008 10:43:36 -0500
Subject: [PATCH] end of methods switch: remove the old ones

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 73b6ce47c86..55124ac8c7a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,11 +1033,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
-			if (disk->fops->__open) {
-				ret = disk->fops->__open(bdev->bd_inode, file);
-				if (ret)
-					goto out_first;
-			}
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, file->f_mode);
 				if (ret)
@@ -1079,11 +1074,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
-			if (bdev->bd_disk->fops->__open) {
-				ret = bdev->bd_disk->fops->__open(bdev->bd_inode, file);
-				if (ret)
-					goto out;
-			}
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
 				if (ret)
@@ -1180,7 +1170,6 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 static int __blkdev_put(struct block_device *bdev, int for_part)
 {
 	int ret = 0;
-	struct inode *bd_inode = bdev->bd_inode;
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
@@ -1194,8 +1183,6 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		kill_bdev(bdev);
 	}
 	if (bdev->bd_contains == bdev) {
-		if (disk->fops->__release)
-			ret = disk->fops->__release(bd_inode, NULL);
 		if (disk->fops->release)
 			ret = disk->fops->release(disk, 0);
 	}
-- 
cgit v1.2.3


From 9a1c3542768b5a58e45a9216921cd10a3bae1205 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 22 Feb 2008 20:40:24 -0500
Subject: [PATCH] pass fmode_t to blkdev_put()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c               | 22 +++++++++++-----------
 fs/ext3/super.c              |  4 ++--
 fs/ext4/super.c              |  4 ++--
 fs/jfs/jfs_logmgr.c          |  4 ++--
 fs/ocfs2/cluster/heartbeat.c |  4 ++--
 fs/partitions/check.c        |  2 +-
 fs/reiserfs/journal.c        |  4 ++--
 7 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 55124ac8c7a..05131baf3cf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -977,7 +977,7 @@ EXPORT_SYMBOL(bd_set_size);
 
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
 			int for_part);
-static int __blkdev_put(struct block_device *bdev, int for_part);
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
  * bd_mutex locking:
@@ -1095,7 +1095,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, 1);
+		__blkdev_put(bdev->bd_contains, file->f_mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1163,11 +1163,11 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (!(res = bd_claim(bdev, filp)))
 		return 0;
 
-	blkdev_put(bdev);
+	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
-static int __blkdev_put(struct block_device *bdev, int for_part)
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	int ret = 0;
 	struct gendisk *disk = bdev->bd_disk;
@@ -1184,7 +1184,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
-			ret = disk->fops->release(disk, 0);
+			ret = disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
@@ -1203,13 +1203,13 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	if (victim)
-		__blkdev_put(victim, 1);
+		__blkdev_put(victim, mode, 1);
 	return ret;
 }
 
-int blkdev_put(struct block_device *bdev)
+int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_put(bdev, 0);
+	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
 
@@ -1218,7 +1218,7 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	if (bdev->bd_holder == filp)
 		bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, filp->f_mode);
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1343,7 +1343,7 @@ struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
 	return bdev;
 	
 blkdev_put:
-	blkdev_put(bdev);
+	blkdev_put(bdev, mode);
 	return ERR_PTR(error);
 }
 
@@ -1359,7 +1359,7 @@ EXPORT_SYMBOL(open_bdev_excl);
 void close_bdev_excl(struct block_device *bdev)
 {
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, 0);	/* move up in the next patches */
 }
 
 EXPORT_SYMBOL(close_bdev_excl);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a260af5544..15c38e69b69 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ fail:
 static int ext3_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -2066,7 +2066,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 		        "EXT3: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9b2b2bc4ec1..c12cf7a657a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -399,7 +399,7 @@ fail:
 static int ext4_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -2553,7 +2553,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 			"EXT4: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cd2ec2988b5..335c4de6552 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1168,7 +1168,7 @@ journal_found:
 	bd_release(bdev);
 
       close:		/* close external log device */
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1514,7 +1514,7 @@ int lmLogClose(struct super_block *sb)
 	rc = lmLogShutdown(log);
 
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
 	kfree(log);
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7dce1612553..4b6fdf591ee 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -976,7 +976,7 @@ static void o2hb_region_release(struct config_item *item)
 	}
 
 	if (reg->hr_bdev)
-		blkdev_put(reg->hr_bdev);
+		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
@@ -1358,7 +1358,7 @@ out:
 		iput(inode);
 	if (ret < 0) {
 		if (reg->hr_bdev) {
-			blkdev_put(reg->hr_bdev);
+			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 			reg->hr_bdev = NULL;
 		}
 	}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index cfb0c80690a..5a35ff2e1a9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -488,7 +488,7 @@ void register_disk(struct gendisk *disk)
 	err = blkdev_get(bdev, FMODE_READ, 0);
 	if (err < 0)
 		goto exit;
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ);
 
 exit:
 	/* announce disk after possible partitions are created */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b89d193a00d..3261518478f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2575,7 +2575,7 @@ static int release_journal_dev(struct super_block *super,
 	if (journal->j_dev_bd != NULL) {
 		if (journal->j_dev_bd->bd_dev != super->s_dev)
 			bd_release(journal->j_dev_bd);
-		result = blkdev_put(journal->j_dev_bd);
+		result = blkdev_put(journal->j_dev_bd, 0); /* move up */
 		journal->j_dev_bd = NULL;
 	}
 
@@ -2618,7 +2618,7 @@ static int journal_init_dev(struct super_block *super,
 		} else if (jdev != super->s_dev) {
 			result = bd_claim(journal->j_dev_bd, journal);
 			if (result) {
-				blkdev_put(journal->j_dev_bd);
+				blkdev_put(journal->j_dev_bd, blkdev_mode);
 				return result;
 			}
 
-- 
cgit v1.2.3


From 30c40d2c01f68c7eb1a41ab3552bdaf5dbf300d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 22 Feb 2008 19:50:45 -0500
Subject: [PATCH] propagate mode through open_bdev_excl/close_bdev_excl

replace open_bdev_excl/close_bdev_excl with variants taking fmode_t.
superblock gets the value used to mount it stored in sb->s_mode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c               | 24 +++++++++++-------------
 fs/reiserfs/journal.c        |  3 ++-
 fs/super.c                   | 14 ++++++++++----
 fs/xfs/linux-2.6/xfs_super.c |  4 ++--
 4 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05131baf3cf..4b595904cef 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1309,32 +1309,29 @@ fail:
 EXPORT_SYMBOL(lookup_bdev);
 
 /**
- * open_bdev_excl  -  open a block device by name and set it up for use
+ * open_bdev_exclusive  -  open a block device by name and set it up for use
  *
  * @path:	special file representing the block device
- * @flags:	%MS_RDONLY for opening read-only
+ * @mode:	FMODE_... combination to pass be used
  * @holder:	owner for exclusion
  *
  * Open the blockdevice described by the special file at @path, claim it
  * for the @holder.
  */
-struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
+struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
 	struct block_device *bdev;
-	fmode_t mode = FMODE_READ;
 	int error = 0;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
-	if (!(flags & MS_RDONLY))
-		mode |= FMODE_WRITE;
 	error = blkdev_get(bdev, mode, 0);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
-	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto blkdev_put;
 	error = bd_claim(bdev, holder);
 	if (error)
@@ -1347,22 +1344,23 @@ blkdev_put:
 	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL(open_bdev_excl);
+EXPORT_SYMBOL(open_bdev_exclusive);
 
 /**
- * close_bdev_excl  -  release a blockdevice openen by open_bdev_excl()
+ * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
  *
  * @bdev:	blockdevice to close
+ * @mode:	mode, must match that used to open.
  *
- * This is the counterpart to open_bdev_excl().
+ * This is the counterpart to open_bdev_exclusive().
  */
-void close_bdev_excl(struct block_device *bdev)
+void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
 {
 	bd_release(bdev);
-	blkdev_put(bdev, 0);	/* move up in the next patches */
+	blkdev_put(bdev, mode);
 }
 
-EXPORT_SYMBOL(close_bdev_excl);
+EXPORT_SYMBOL(close_bdev_exclusive);
 
 int __invalidate_device(struct block_device *bdev)
 {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3261518478f..70b89607667 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2628,7 +2628,8 @@ static int journal_init_dev(struct super_block *super,
 		return 0;
 	}
 
-	journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal);
+	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
+						FMODE_READ|FMODE_WRITE, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
diff --git a/fs/super.c b/fs/super.c
index e931ae9511f..0d77ac20d03 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -760,9 +760,13 @@ int get_sb_bdev(struct file_system_type *fs_type,
 {
 	struct block_device *bdev;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
-	bdev = open_bdev_excl(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
@@ -785,11 +789,12 @@ int get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
+		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -807,7 +812,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 error:
 	return error;
 }
@@ -817,10 +822,11 @@ EXPORT_SYMBOL(get_sb_bdev);
 void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
+	fmode_t mode = sb->s_mode;
 
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 }
 
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e39013619b2..37ebe36056e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -589,7 +589,7 @@ xfs_blkdev_get(
 {
 	int			error = 0;
 
-	*bdevp = open_bdev_excl(name, 0, mp);
+	*bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
 		printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -603,7 +603,7 @@ xfs_blkdev_put(
 	struct block_device	*bdev)
 {
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 /*
-- 
cgit v1.2.3


From e5eb8caa83a76191feb9705c1a0a689ca260b91e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:24:05 -0400
Subject: [PATCH] remember mode of reiserfs journal

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/journal.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 70b89607667..9643c3bbeb3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2575,7 +2575,7 @@ static int release_journal_dev(struct super_block *super,
 	if (journal->j_dev_bd != NULL) {
 		if (journal->j_dev_bd->bd_dev != super->s_dev)
 			bd_release(journal->j_dev_bd);
-		result = blkdev_put(journal->j_dev_bd, 0); /* move up */
+		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
 
@@ -2608,6 +2608,7 @@ static int journal_init_dev(struct super_block *super,
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
 		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
 			journal->j_dev_bd = NULL;
@@ -2628,8 +2629,9 @@ static int journal_init_dev(struct super_block *super,
 		return 0;
 	}
 
+	journal->j_dev_mode = blkdev_mode;
 	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						FMODE_READ|FMODE_WRITE, journal);
+						blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
-- 
cgit v1.2.3


From 572c48921574dbe6dceb958cf965aa962baefde4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Oct 2007 13:24:05 -0400
Subject: [PATCH] sanitize blkdev_get() and friends

* get rid of fake struct file/struct dentry in __blkdev_get()
* merge __blkdev_get() and do_open()
* get rid of flags argument of blkdev_get()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c               | 65 +++++++++++++++-----------------------------
 fs/ocfs2/cluster/heartbeat.c |  2 +-
 fs/partitions/check.c        |  2 +-
 3 files changed, 24 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4b595904cef..b89c956e04f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -844,9 +844,8 @@ struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
-	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
 	if (bdev)
-		err = blkdev_get(bdev, mode, flags);
+		err = blkdev_get(bdev, mode);
 	return err ? ERR_PTR(err) : bdev;
 }
 
@@ -975,8 +974,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
-			int for_part);
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -986,7 +983,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
  *    mutex_lock_nested(whole->bd_mutex, 1)
  */
 
-static int do_open(struct block_device *bdev, struct file *file, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
 	struct hd_struct *part = NULL;
@@ -994,9 +991,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	int partno;
 	int perm = 0;
 
-	if (file->f_mode & FMODE_READ)
+	if (mode & FMODE_READ)
 		perm |= MAY_READ;
-	if (file->f_mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
 	/*
 	 * hooks: /n/, see "layering violations".
@@ -1007,15 +1004,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		return ret;
 	}
 
-	if (file->f_flags & O_NDELAY)
-		file->f_mode |= FMODE_NDELAY;
-	if (file->f_flags & O_EXCL)
-		file->f_mode |= FMODE_EXCL;
-	if ((file->f_flags & O_ACCMODE) == 3)
-		file->f_mode |= FMODE_WRITE_IOCTL;
-
 	ret = -ENXIO;
-	file->f_mapping = bdev->bd_inode->i_mapping;
 
 	lock_kernel();
 
@@ -1034,7 +1023,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
-				ret = disk->fops->open(bdev, file->f_mode);
+				ret = disk->fops->open(bdev, mode);
 				if (ret)
 					goto out_clear;
 			}
@@ -1054,7 +1043,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (!whole)
 				goto out_clear;
 			BUG_ON(for_part);
-			ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
+			ret = __blkdev_get(whole, mode, 1);
 			if (ret)
 				goto out_clear;
 			bdev->bd_contains = whole;
@@ -1075,7 +1064,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
-				ret = bdev->bd_disk->fops->open(bdev, file->f_mode);
+				ret = bdev->bd_disk->fops->open(bdev, mode);
 				if (ret)
 					goto out_unlock_bdev;
 			}
@@ -1095,7 +1084,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, file->f_mode, 1);
+		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1111,28 +1100,9 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	return ret;
 }
 
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags,
-			int for_part)
-{
-	/*
-	 * This crockload is due to bad choice of ->open() type.
-	 * It will go away.
-	 * For now, block device ->open() routine must _not_
-	 * examine anything in 'inode' argument except ->i_rdev.
-	 */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_flags = flags;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	return do_open(bdev, &fake_file, for_part);
-}
-
-int blkdev_get(struct block_device *bdev, fmode_t mode, unsigned flags)
+int blkdev_get(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_get(bdev, mode, flags, 0);
+	return __blkdev_get(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_get);
 
@@ -1149,15 +1119,24 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	 */
 	filp->f_flags |= O_LARGEFILE;
 
+	if (filp->f_flags & O_NDELAY)
+		filp->f_mode |= FMODE_NDELAY;
+	if (filp->f_flags & O_EXCL)
+		filp->f_mode |= FMODE_EXCL;
+	if ((filp->f_flags & O_ACCMODE) == 3)
+		filp->f_mode |= FMODE_WRITE_IOCTL;
+
 	bdev = bd_acquire(inode);
 	if (bdev == NULL)
 		return -ENOMEM;
 
-	res = do_open(bdev, filp, 0);
+	filp->f_mapping = bdev->bd_inode->i_mapping;
+
+	res = blkdev_get(bdev, filp->f_mode);
 	if (res)
 		return res;
 
-	if (!(filp->f_flags & O_EXCL) )
+	if (!(filp->f_mode & FMODE_EXCL))
 		return 0;
 
 	if (!(res = bd_claim(bdev, filp)))
@@ -1327,7 +1306,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if (IS_ERR(bdev))
 		return bdev;
 
-	error = blkdev_get(bdev, mode, 0);
+	error = blkdev_get(bdev, mode);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4b6fdf591ee..6ebaa58e2c0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1268,7 +1268,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out;
 
 	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
 	if (ret) {
 		reg->hr_bdev = NULL;
 		goto out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5a35ff2e1a9..633f7a0ebb2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -485,7 +485,7 @@ void register_disk(struct gendisk *disk)
 		goto exit;
 
 	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ, 0);
+	err = blkdev_get(bdev, FMODE_READ);
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev, FMODE_READ);
-- 
cgit v1.2.3


From 56b26add02b4bdea81d5e0ebda60db1fe3311ad4 Mon Sep 17 00:00:00 2001
From: Al Viro <al@aretha.pdmi.ras.ru>
Date: Fri, 19 Sep 2008 03:17:36 -0400
Subject: [PATCH] kill the rest of struct file propagation in block ioctls

Now we can switch blkdev_ioctl() block_device/mode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b89c956e04f..05865b93f7e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1202,7 +1202,11 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	fmode_t mode = file->f_mode;
+	if (file->f_flags & O_NDELAY)
+		mode |= FMODE_NDELAY_NOW;
+	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
 static const struct address_space_operations def_blk_aops = {
@@ -1238,7 +1242,7 @@ int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
 	int res;
 	mm_segment_t old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
+	res = blkdev_ioctl(bdev, 0, cmd, arg);
 	set_fs(old_fs);
 	return res;
 }
-- 
cgit v1.2.3


From f04de505e3fa322728d1a851e08bf7060b117743 Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Tue, 21 Oct 2008 13:25:51 +0100
Subject: [JFFS2] Fix build failure with !CONFIG_JFFS2_FS_WRITEBUFFER

Build failure introduced by 5bf1723723487ddb0b9c9641b6559da96b27cc93
[JFFS2] Write buffer offset adjustment for NOR-ECC (Sibley) flash

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/nodemgmt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0875b60b4bf..21a052915aa 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,9 +261,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* adjust write buffer offset, else we get a non contiguous write bug */
 	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
 		c->wbuf_ofs = 0xffffffff;
+#endif
 
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
-- 
cgit v1.2.3


From a364bc0b37f14ffd66c1f982af42990a9d77fa43 Mon Sep 17 00:00:00 2001
From: Jeff Layton <sfrench@us.ibm.com>
Date: Tue, 21 Oct 2008 14:42:13 +0000
Subject: [CIFS] fix saving of resume key before CIFSFindNext

We recently fixed the cifs readdir code so that it saves the resume key
before calling CIFSFindNext. Unfortunately, this assumes that we have
just done a CIFSFindFirst (or FindNext) and have resume info to save.
This isn't necessarily the case. Fix the code to save resume info if we
had to reinitiate the search, and after a FindNext.

This fixes connectathon basic test6 against NetApp filers.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
CC: Stable <stable@kernel.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 765adf12d54..58d57299f2a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -762,14 +762,15 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 				 rc));
 			return rc;
 		}
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 	}
 
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	      (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
 		cFYI(1, ("calling findnext2"));
-		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		if (rc)
 			return -ENOENT;
 	}
-- 
cgit v1.2.3


From 1cd9cd161c89f569b90583b7797bd972c3bf0cff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 22 Oct 2008 13:12:36 -0400
Subject: NFSD: Fix BUG during NFSD shutdown processing

The Linux NFS server can be started via a user-space write to
/proc/fs/nfs/threads or to /proc/fs/nfs/portlist.  In the first case,
all default listeners are started (both UDP and TCP).  In the second,
a listener is started only for one specified transport.

The NFS server has to make sure lockd stays up until the last listener
transport goes away.  To support both start-up interfaces, it should
do one lockd_up() for each NFSD listener.

The nfsd_init_socks() function used to do one lockd_up() call for each
svc_create_xprt().  Recently commit
26a414092353590ceaa5955bcb53f863d6ea7549 mistakenly changed
nfsd_init_socks() to do only one lockd_up() call even though it still
does two svc_create_xprt() calls.

The end result is a lockd_down() BUG during NFSD shutdown processing
because nfsd_last_threads() does a lockd_down() call for each entry
on the sv_permsocks list, but the start-up code doesn't do a matching
number of lockd_up() calls.

Add a second lockd_up() in nfsd_init_socks() to make sure the number
of lockd_up() calls matches the number of entries on the NFS servers's
sv_permsocks list.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfssvc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 59eeb46f82c..07e4f5d7baa 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -249,6 +249,10 @@ static int nfsd_init_socks(int port)
 	if (error < 0)
 		return error;
 
+	error = lockd_up();
+	if (error < 0)
+		return error;
+
 	error = svc_create_xprt(nfsd_serv, "tcp", port,
 					SVC_SOCK_DEFAULTS);
 	if (error < 0)
-- 
cgit v1.2.3


From 6c6a426fdcb374b7641d7cf9eea88410828b9d9a Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Wed, 22 Oct 2008 14:48:36 +0530
Subject: nfsd: Fix memory leak in nfsd_getxattr

Fix a memory leak in nfsd_getxattr. nfsd_getxattr should free up memory
	that it allocated if vfs_getxattr fails.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index aa1d0d6489a..9609eb51d72 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -410,6 +410,7 @@ out_nfserr:
 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 {
 	ssize_t buflen;
+	ssize_t ret;
 
 	buflen = vfs_getxattr(dentry, key, NULL, 0);
 	if (buflen <= 0)
@@ -419,7 +420,10 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 	if (!*buf)
 		return -ENOMEM;
 
-	return vfs_getxattr(dentry, key, *buf, buflen);
+	ret = vfs_getxattr(dentry, key, *buf, buflen);
+	if (ret < 0)
+		kfree(*buf);
+	return ret;
 }
 #endif
 
-- 
cgit v1.2.3


From 6dfcde98a299196f13dd66417663a819f0ac4156 Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 20 Oct 2008 11:44:40 +0530
Subject: nfsd: Drop reference in expkey_parse error cases

Drop reference to export key on error. Compile tested.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/export.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9dc036f1835..7ce2c6e4e23 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -151,8 +151,10 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* now we want a pathname, or empty meaning NEGATIVE  */
 	err = -EINVAL;
-	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) < 0)
+	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) < 0) {
+		cache_put(&ek->h, &svc_expkey_cache);
 		goto out;
+	}
 	dprintk("Path seems to be <%s>\n", buf);
 	err = 0;
 	if (len == 0) {
@@ -164,8 +166,10 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 	} else {
 		struct nameidata nd;
 		err = path_lookup(buf, 0, &nd);
-		if (err)
+		if (err) {
+			cache_put(&ek->h, &svc_expkey_cache);
 			goto out;
+		}
 
 		dprintk("Found the path %s\n", buf);
 		key.ek_path = nd.path;
-- 
cgit v1.2.3


From 30bc4dfd3b64eb1fbefe2c63e30d8fc129273e20 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 20 Oct 2008 16:34:21 -0400
Subject: nfsd: clean up expkey_parse error cases

We might as well do all of these at the end.  Fix up a couple minor
style nits while we're there.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/export.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 7ce2c6e4e23..5cd882b8871 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -99,7 +99,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int fsidtype;
 	char *ep;
 	struct svc_expkey key;
-	struct svc_expkey *ek;
+	struct svc_expkey *ek = NULL;
 
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
@@ -107,7 +107,8 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	err = -ENOMEM;
-	if (!buf) goto out;
+	if (!buf)
+		goto out;
 
 	err = -EINVAL;
 	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
@@ -151,38 +152,34 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* now we want a pathname, or empty meaning NEGATIVE  */
 	err = -EINVAL;
-	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) < 0) {
-		cache_put(&ek->h, &svc_expkey_cache);
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len < 0)
 		goto out;
-	}
 	dprintk("Path seems to be <%s>\n", buf);
 	err = 0;
 	if (len == 0) {
 		set_bit(CACHE_NEGATIVE, &key.h.flags);
 		ek = svc_expkey_update(&key, ek);
-		if (ek)
-			cache_put(&ek->h, &svc_expkey_cache);
-		else err = -ENOMEM;
+		if (!ek)
+			err = -ENOMEM;
 	} else {
 		struct nameidata nd;
 		err = path_lookup(buf, 0, &nd);
-		if (err) {
-			cache_put(&ek->h, &svc_expkey_cache);
+		if (err)
 			goto out;
-		}
 
 		dprintk("Found the path %s\n", buf);
 		key.ek_path = nd.path;
 
 		ek = svc_expkey_update(&key, ek);
-		if (ek)
-			cache_put(&ek->h, &svc_expkey_cache);
-		else
+		if (!ek)
 			err = -ENOMEM;
 		path_put(&nd.path);
 	}
 	cache_flush();
  out:
+	if (ek)
+		cache_put(&ek->h, &svc_expkey_cache);
 	if (dom)
 		auth_domain_put(dom);
 	kfree(buf);
-- 
cgit v1.2.3


From ea2e7996fc892e9becfed9145fdcefd59f697718 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Wed, 22 Oct 2008 18:48:45 -0500
Subject: 9p: fix format warning

This patch fixes a format warning which appears on 64-bit builds.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 041c5269228..68bf2af6c38 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -178,7 +178,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	int ret;
 	struct p9_fid *fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "count %d offset %lld\n", count, *offset);
+	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
 	if (count > (fid->clnt->msize - P9_IOHDRSZ))
-- 
cgit v1.2.3


From 84210e9120a8c01a14379ba1f9a9b0f963641d94 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 23 Oct 2008 04:42:37 +0000
Subject: [CIFS] improve setlease handling

fcntl(F_SETLEASE) currently is not exported by cifs (nor by local file
systems) so cifs grants leases based on how other local processes have
opened the file not by whether the file is cacheable (oplocked).  This
adds the check to make sure that the file is cacheable on the client
before checking whether we can grant the lease locally
(generic_setlease).  It also adds a mount option for cifs (locallease)
if the user wants to override this and try to grant leases even
if the server did not grant oplock.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES    |  3 ++-
 fs/cifs/README     | 16 ++++++++++++++++
 fs/cifs/cifsfs.c   | 41 +++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  |  8 +++++++-
 5 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index a7255751ffb..8f528ea24c4 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,8 @@ Version 1.55
 Various fixes to make delete of open files behavior more predictable
 (when delete of an open file fails we mark the file as "delete-on-close"
 in a way that more servers accept, but only if we can first rename the
-file to a temporary name)
+file to a temporary name).  Add experimental support for more safely
+handling fcntl(F_SETLEASE).
 
 Version 1.54
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index cbe26fe4012..a439dc1739b 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -491,6 +491,19 @@ A partial list of the supported mount options follows:
 		Note that this differs from the sign mount option in that it
 		causes encryption of data sent over this mounted share but other
 		shares mounted to the same server are unaffected.
+ locallease     This option is rarely needed. Fcntl F_SETLEASE is
+		used by some applications such as Samba and NFSv4 server to
+		check to see whether a file is cacheable.  CIFS has no way
+		to explicitly request a lease, but can check whether a file
+		is cacheable (oplocked).  Unfortunately, even if a file
+		is not oplocked, it could still be cacheable (ie cifs client
+		could grant fcntl leases if no other local processes are using
+		the file) for cases for example such as when the server does not
+		support oplocks and the user is sure that the only updates to
+		the file will be from this client. Specifying this mount option
+		will allow the cifs client to check for leases (only) locally
+		for files which are not oplocked instead of denying leases
+		in that case. (EXPERIMENTAL)
  sec            Security mode.  Allowed values are:
 			none	attempt to connection as a null user (no name)
 			krb5    Use Kerberos version 5 authentication
@@ -641,6 +654,9 @@ requires enabling CONFIG_CIFS_EXPERIMENTAL
 	cifsacl support needed to retrieve approximated mode bits based on
 		the contents on the CIFS ACL.
 
+	lease support: cifs will check the oplock state before calling into
+	the vfs to see if we can grant a lease on a file.
+
 	DNOTIFY fcntl: needed for support of directory change 
 			    notification and perhaps later for file leases)
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c6aad775dd6..76919c25acc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -618,6 +618,37 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 	return generic_file_llseek_unlocked(file, offset, origin);
 }
 
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
+{
+	/* note that this is called by vfs setlease with the BKL held
+	   although I doubt that BKL is needed here in cifs */
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	if (!(S_ISREG(inode->i_mode)))
+		return -EINVAL;
+
+	/* check if file is oplocked */
+	if (((arg == F_RDLCK) &&
+		(CIFS_I(inode)->clientCanCacheRead)) ||
+	    ((arg == F_WRLCK) &&
+		(CIFS_I(inode)->clientCanCacheAll)))
+		return generic_setlease(file, arg, lease);
+	else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
+			!CIFS_I(inode)->clientCanCacheRead)
+		/* If the server claims to support oplock on this
+		   file, then we still need to check oplock even
+		   if the local_lease mount option is set, but there
+		   are servers which do not support oplock for which
+		   this mount option may be useful if the user
+		   knows that the file won't be changed on the server
+		   by anyone else */
+		return generic_setlease(file, arg, lease);
+	else
+		return -EAGAIN;
+}
+#endif
+
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
@@ -696,6 +727,7 @@ const struct file_operations cifs_file_ops = {
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -716,6 +748,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 const struct file_operations cifs_file_nobrl_ops = {
@@ -736,6 +769,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -755,6 +789,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -946,6 +981,12 @@ static int cifs_oplock_thread(void *dummyarg)
 				the call */
 			/* mutex_lock(&inode->i_mutex);*/
 			if (S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+				if (CIFS_I(inode)->clientCanCacheAll == 0)
+					break_lease(inode, FMODE_READ);
+				else if (CIFS_I(inode)->clientCanCacheRead == 0)
+					break_lease(inode, FMODE_WRITE);
+#endif
 				rc = filemap_fdatawrite(inode->i_mapping);
 				if (CIFS_I(inode)->clientCanCacheRead == 0) {
 					waitrc = filemap_fdatawait(
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 178f733a368..c791e5b5a91 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -285,6 +285,7 @@ struct cifsTconInfo {
 	bool seal:1;      /* transport encryption for this mounted share */
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
+	bool local_lease:1; /* check leases (only) on local system not remote */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1126f7ab460..f51b79a67e1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -90,7 +90,8 @@ struct smb_vol {
 	bool nocase:1;     /* request case insensitive filenames */
 	bool nobrl:1;      /* disable sending byte range locks to srv */
 	bool seal:1;       /* request transport encryption on share */
-	bool nodfs:1;
+	bool nodfs:1;      /* Do not request DFS, even if available */
+	bool local_lease:1; /* check leases only on local system, not remote */
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -1264,6 +1265,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->no_psx_acl = 0;
 		} else if (strnicmp(data, "noacl", 5) == 0) {
 			vol->no_psx_acl = 1;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		} else if (strnicmp(data, "locallease", 6) == 0) {
+			vol->local_lease = 1;
+#endif
 		} else if (strnicmp(data, "sign", 4) == 0) {
 			vol->secFlg |= CIFSSEC_MUST_SIGN;
 		} else if (strnicmp(data, "seal", 4) == 0) {
@@ -2162,6 +2167,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   for the retry flag is used */
 			tcon->retry = volume_info.retry;
 			tcon->nocase = volume_info.nocase;
+			tcon->local_lease = volume_info.local_lease;
 			if (tcon->seal != volume_info.seal)
 				cERROR(1, ("transport encryption setting "
 					   "conflicts with existing tid"));
-- 
cgit v1.2.3


From 8d281efb67463fe8aac8f6e10b31492fc218bf2b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 22 Oct 2008 13:57:01 -0400
Subject: cifs: fix unlinking of rename target when server doesn't support open
 file renames

cifs: fix unlinking of rename target when server doesn't support open file renames

The patch to make cifs_rename undoable broke renaming one file on top of
another when the server doesn't support busy file renames. Remove the
code that uses busy file renames to unlink the target file, and just
have it call cifs_unlink. If the rename of the source file fails, then
the unlink won't be undoable, but hopefully that's rare enough that it
won't be a problem.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 87 ++++++---------------------------------------------------
 1 file changed, 8 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8ac4eabc1f8..d54fa8aeaea 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1293,12 +1293,9 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
 	struct cifsTconInfo *tcon;
-	struct cifsInodeInfo *target_cinode;
 	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
 	FILE_UNIX_BASIC_INFO *info_buf_target;
-	__u16 dstfid;
-	int xid, rc, tmprc, oplock = 0;
-	bool delete_already_pending;
+	int xid, rc, tmprc;
 
 	cifs_sb_target = CIFS_SB(target_dir->i_sb);
 	cifs_sb_source = CIFS_SB(source_dir->i_sb);
@@ -1348,104 +1345,36 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 		}
 
 		info_buf_target = info_buf_source + 1;
-		rc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
 					info_buf_source,
 					cifs_sb_source->local_nls,
 					cifs_sb_source->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0)
+		if (tmprc != 0)
 			goto unlink_target;
 
-		rc = CIFSSMBUnixQPathInfo(xid, tcon,
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon,
 					toName, info_buf_target,
 					cifs_sb_target->local_nls,
 					/* remap based on source sb */
 					cifs_sb_source->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
-		if (rc == 0 && (info_buf_source->UniqueId ==
-				info_buf_target->UniqueId))
+		if (tmprc == 0 && (info_buf_source->UniqueId ==
+				   info_buf_target->UniqueId))
 			/* same file, POSIX says that this is a noop */
 			goto cifs_rename_exit;
-
-		rc = -EEXIST;
 	} /* else ... BB we could add the same check for Windows by
 		     checking the UniqueId via FILE_INTERNAL_INFO */
 
-	if ((rc == -EACCES) || (rc == -EEXIST)) {
 unlink_target:
-		/* don't bother if this is a negative dentry */
-		if (!target_dentry->d_inode)
-			goto cifs_rename_exit;
-
-		target_cinode = CIFS_I(target_dentry->d_inode);
-
-		/* try to move the target out of the way */
-		tmprc = CIFSSMBOpen(xid, tcon, toName, FILE_OPEN, DELETE,
-				    CREATE_NOT_DIR, &dstfid, &oplock, NULL,
-				    cifs_sb_target->local_nls,
-				    cifs_sb_target->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if ((rc == -EACCES) || (rc == -EEXIST)) {
+		tmprc = cifs_unlink(target_dir, target_dentry);
 		if (tmprc)
 			goto cifs_rename_exit;
 
-		/* rename the file to random name */
-		tmprc = CIFSSMBRenameOpenFile(xid, tcon, dstfid, NULL,
-					      cifs_sb_target->local_nls,
-					      cifs_sb_target->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-		if (tmprc)
-			goto close_target;
-
-		delete_already_pending = target_cinode->delete_pending;
-
-		if (!delete_already_pending) {
-			/* set delete on close */
-			tmprc = CIFSSMBSetFileDisposition(xid, tcon,
-							  true, dstfid,
-							  current->tgid);
-			/*
-			 * This hack is for broken samba servers, remove this
-			 * once more fixed ones are in the field.
-			 */
-			if (tmprc == -ENOENT)
-				delete_already_pending = false;
-			else if (tmprc)
-				goto undo_target_rename;
-
-			target_cinode->delete_pending = true;
-		}
-
-
 		rc = cifs_do_rename(xid, source_dentry, fromName,
 				    target_dentry, toName);
-
-		if (rc == 0)
-			goto close_target;
-
-		/*
-		 * after this point, we can't bother with error handling on
-		 * the undo's. This is best effort since we can't do anything
-		 * about failures here.
-		 */
-		if (!delete_already_pending) {
-			tmprc = CIFSSMBSetFileDisposition(xid, tcon,
-							  false, dstfid,
-							  current->tgid);
-			if (tmprc == 0)
-				target_cinode->delete_pending = false;
-		}
-
-undo_target_rename:
-		/* rename failed: undo target rename */
-		CIFSSMBRenameOpenFile(xid, tcon, dstfid,
-				      target_dentry->d_name.name,
-				      cifs_sb_target->local_nls,
-				      cifs_sb_target->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-close_target:
-		CIFSSMBClose(xid, tcon, dstfid);
 	}
 
 cifs_rename_exit:
-- 
cgit v1.2.3


From b1c8d2b421376bc941823ee93e36cb491609b02c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 22 Oct 2008 13:57:07 -0400
Subject: cifs: handle the TCP_Server_Info->tsk field more carefully

cifs: handle the TCP_Server_Info->tsk field more carefully

We currently handle the TCP_Server_Info->tsk field without any locking,
but with some half-measures to try and prevent races. These aren't
really sufficient though. When taking down cifsd, use xchg() to swap
the contents of the tsk field with NULL so we don't end up trying
to send it more than one signal. Also, don't allow cifsd to exit until
the signal is received if we expect one.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f51b79a67e1..71b7661e226 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -750,6 +750,7 @@ multi_t2_fnd:
 	write_unlock(&GlobalSMBSeslock);
 
 	kfree(server->hostname);
+	task_to_wake = xchg(&server->tsk, NULL);
 	kfree(server);
 
 	length = atomic_dec_return(&tcpSesAllocCount);
@@ -757,6 +758,16 @@ multi_t2_fnd:
 		mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
 				GFP_KERNEL);
 
+	/* if server->tsk was NULL then wait for a signal before exiting */
+	if (!task_to_wake) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!signal_pending(current)) {
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+		set_current_state(TASK_RUNNING);
+	}
+
 	return 0;
 }
 
@@ -1846,6 +1857,16 @@ convert_delimiter(char *path, char delim)
 	}
 }
 
+static void
+kill_cifsd(struct TCP_Server_Info *server)
+{
+	struct task_struct *task;
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -2233,7 +2254,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			spin_lock(&GlobalMid_Lock);
 			srvTcp->tcpStatus = CifsExiting;
 			spin_unlock(&GlobalMid_Lock);
-			force_sig(SIGKILL, srvTcp->tsk);
+			kill_cifsd(srvTcp);
 		}
 		 /* If find_unc succeeded then rc == 0 so we can not end */
 		if (tcon)  /* up accidently freeing someone elses tcon struct */
@@ -2246,19 +2267,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
 					/* if the socketUseCount is now zero */
 					if ((temp_rc == -ESHUTDOWN) &&
-					    (pSesInfo->server) &&
-					    (pSesInfo->server->tsk))
-						force_sig(SIGKILL,
-							pSesInfo->server->tsk);
+					    (pSesInfo->server))
+						kill_cifsd(pSesInfo->server);
 				} else {
 					cFYI(1, ("No session or bad tcon"));
-					if ((pSesInfo->server) &&
-					    (pSesInfo->server->tsk)) {
+					if (pSesInfo->server) {
 						spin_lock(&GlobalMid_Lock);
 						srvTcp->tcpStatus = CifsExiting;
 						spin_unlock(&GlobalMid_Lock);
-						force_sig(SIGKILL,
-							pSesInfo->server->tsk);
+						kill_cifsd(pSesInfo->server);
 					}
 				}
 				sesInfoFree(pSesInfo);
@@ -3545,7 +3562,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	int rc = 0;
 	int xid;
 	struct cifsSesInfo *ses = NULL;
-	struct task_struct *cifsd_task;
 	char *tmp;
 
 	xid = GetXid();
@@ -3561,7 +3577,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 		tconInfoFree(cifs_sb->tcon);
 		if ((ses) && (ses->server)) {
 			/* save off task so we do not refer to ses later */
-			cifsd_task = ses->server->tsk;
 			cFYI(1, ("About to do SMBLogoff "));
 			rc = CIFSSMBLogoff(xid, ses);
 			if (rc == -EBUSY) {
@@ -3569,8 +3584,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 				return 0;
 			} else if (rc == -ESHUTDOWN) {
 				cFYI(1, ("Waking up socket by sending signal"));
-				if (cifsd_task)
-					force_sig(SIGKILL, cifsd_task);
+				if (ses->server)
+					kill_cifsd(ses->server);
 				rc = 0;
 			} /* else - we have an smb session
 				left on this socket do not kill cifsd */
-- 
cgit v1.2.3


From d181146572c4fa9af2a068b967cb53dcac7da944 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 00:49:18 -0400
Subject: [PATCH] new helper - kern_path()

Analog of lookup_path(), takes struct path *.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 4ea63ed5e79..4a56f9b59e8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1106,6 +1106,15 @@ int path_lookup(const char *name, unsigned int flags,
 	return do_path_lookup(AT_FDCWD, name, flags, nd);
 }
 
+int kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	struct nameidata nd;
+	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
+	if (!res)
+		*path = nd.path;
+	return res;
+}
+
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
  * @dentry:  pointer to dentry of the base directory
@@ -2855,6 +2864,7 @@ EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
 EXPORT_SYMBOL(vfs_permission);
-- 
cgit v1.2.3


From 2d92ab3c6279f8423b20cf91574d0ad6696d2b44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 00:51:11 -0400
Subject: [PATCH] finally get rid of nameidata in namespace.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 119 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 59 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6e283c93b50..9f6005e5586 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1167,19 +1167,19 @@ asmlinkage long sys_oldumount(char __user * name)
 
 #endif
 
-static int mount_is_safe(struct nameidata *nd)
+static int mount_is_safe(struct path *path)
 {
 	if (capable(CAP_SYS_ADMIN))
 		return 0;
 	return -EPERM;
 #ifdef notyet
-	if (S_ISLNK(nd->path.dentry->d_inode->i_mode))
+	if (S_ISLNK(path->dentry->d_inode->i_mode))
 		return -EPERM;
-	if (nd->path.dentry->d_inode->i_mode & S_ISVTX) {
-		if (current->uid != nd->path.dentry->d_inode->i_uid)
+	if (path->dentry->d_inode->i_mode & S_ISVTX) {
+		if (current->uid != path->dentry->d_inode->i_uid)
 			return -EPERM;
 	}
-	if (vfs_permission(nd, MAY_WRITE))
+	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
 		return -EPERM;
 	return 0;
 #endif
@@ -1427,9 +1427,9 @@ out_unlock:
  * recursively change the type of the mountpoint.
  * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_change_type(struct nameidata *nd, int flag)
+static noinline int do_change_type(struct path *path, int flag)
 {
-	struct vfsmount *m, *mnt = nd->path.mnt;
+	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
 	int type = flag & ~MS_REC;
 	int err = 0;
@@ -1437,7 +1437,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (nd->path.dentry != nd->path.mnt->mnt_root)
+	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
 	down_write(&namespace_sem);
@@ -1461,38 +1461,38 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
  * do loopback mount.
  * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_loopback(struct nameidata *nd, char *old_name,
+static noinline int do_loopback(struct path *path, char *old_name,
 				int recurse)
 {
-	struct nameidata old_nd;
+	struct path old_path;
 	struct vfsmount *mnt = NULL;
-	int err = mount_is_safe(nd);
+	int err = mount_is_safe(path);
 	if (err)
 		return err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
 	down_write(&namespace_sem);
 	err = -EINVAL;
-	if (IS_MNT_UNBINDABLE(old_nd.path.mnt))
+	if (IS_MNT_UNBINDABLE(old_path.mnt))
 		goto out;
 
-	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
+	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
 		goto out;
 
 	err = -ENOMEM;
 	if (recurse)
-		mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0);
+		mnt = copy_tree(old_path.mnt, old_path.dentry, 0);
 	else
-		mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0);
+		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
 
 	if (!mnt)
 		goto out;
 
-	err = graft_tree(mnt, &nd->path);
+	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
@@ -1503,7 +1503,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
 
 out:
 	up_write(&namespace_sem);
-	path_put(&old_nd.path);
+	path_put(&old_path);
 	return err;
 }
 
@@ -1530,31 +1530,31 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
  * on it - tough luck.
  * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+static noinline int do_remount(struct path *path, int flags, int mnt_flags,
 		      void *data)
 {
 	int err;
-	struct super_block *sb = nd->path.mnt->mnt_sb;
+	struct super_block *sb = path->mnt->mnt_sb;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!check_mnt(nd->path.mnt))
+	if (!check_mnt(path->mnt))
 		return -EINVAL;
 
-	if (nd->path.dentry != nd->path.mnt->mnt_root)
+	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
-		err = change_mount_flags(nd->path.mnt, flags);
+		err = change_mount_flags(path->mnt, flags);
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
-		nd->path.mnt->mnt_flags = mnt_flags;
+		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
 	if (!err)
-		security_sb_post_remount(nd->path.mnt, flags, data);
+		security_sb_post_remount(path->mnt, flags, data);
 	return err;
 }
 
@@ -1571,78 +1571,77 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
 /*
  * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_move_mount(struct nameidata *nd, char *old_name)
+static noinline int do_move_mount(struct path *path, char *old_name)
 {
-	struct nameidata old_nd;
-	struct path parent_path;
+	struct path old_path, parent_path;
 	struct vfsmount *p;
 	int err = 0;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
 	down_write(&namespace_sem);
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path.mnt, &nd->path.dentry))
+	while (d_mountpoint(path->dentry) &&
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
+	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
 		goto out;
 
 	err = -ENOENT;
-	mutex_lock(&nd->path.dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(nd->path.dentry->d_inode))
+	mutex_lock(&path->dentry->d_inode->i_mutex);
+	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry))
+	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
-	if (old_nd.path.dentry != old_nd.path.mnt->mnt_root)
+	if (old_path.dentry != old_path.mnt->mnt_root)
 		goto out1;
 
-	if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent)
+	if (old_path.mnt == old_path.mnt->mnt_parent)
 		goto out1;
 
-	if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
-	      S_ISDIR(old_nd.path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
+	      S_ISDIR(old_path.dentry->d_inode->i_mode))
 		goto out1;
 	/*
 	 * Don't move a mount residing in a shared parent.
 	 */
-	if (old_nd.path.mnt->mnt_parent &&
-	    IS_MNT_SHARED(old_nd.path.mnt->mnt_parent))
+	if (old_path.mnt->mnt_parent &&
+	    IS_MNT_SHARED(old_path.mnt->mnt_parent))
 		goto out1;
 	/*
 	 * Don't move a mount tree containing unbindable mounts to a destination
 	 * mount which is shared.
 	 */
-	if (IS_MNT_SHARED(nd->path.mnt) &&
-	    tree_contains_unbindable(old_nd.path.mnt))
+	if (IS_MNT_SHARED(path->mnt) &&
+	    tree_contains_unbindable(old_path.mnt))
 		goto out1;
 	err = -ELOOP;
-	for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent)
-		if (p == old_nd.path.mnt)
+	for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent)
+		if (p == old_path.mnt)
 			goto out1;
 
-	err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path);
+	err = attach_recursive_mnt(old_path.mnt, path, &parent_path);
 	if (err)
 		goto out1;
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
-	list_del_init(&old_nd.path.mnt->mnt_expire);
+	list_del_init(&old_path.mnt->mnt_expire);
 out1:
-	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
 out:
 	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
-	path_put(&old_nd.path);
+	path_put(&old_path);
 	return err;
 }
 
@@ -1651,7 +1650,7 @@ out:
  * namespace's tree
  * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
+static noinline int do_new_mount(struct path *path, char *type, int flags,
 			int mnt_flags, char *name, void *data)
 {
 	struct vfsmount *mnt;
@@ -1667,7 +1666,7 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-	return do_add_mount(mnt, &nd->path, mnt_flags, NULL);
+	return do_add_mount(mnt, path, mnt_flags, NULL);
 }
 
 /*
@@ -1902,7 +1901,7 @@ int copy_mount_options(const void __user * data, unsigned long *where)
 long do_mount(char *dev_name, char *dir_name, char *type_page,
 		  unsigned long flags, void *data_page)
 {
-	struct nameidata nd;
+	struct path path;
 	int retval = 0;
 	int mnt_flags = 0;
 
@@ -1940,29 +1939,29 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
 
 	/* ... and get the mountpoint */
-	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
+	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
 	if (retval)
 		return retval;
 
-	retval = security_sb_mount(dev_name, &nd.path,
+	retval = security_sb_mount(dev_name, &path,
 				   type_page, flags, data_page);
 	if (retval)
 		goto dput_out;
 
 	if (flags & MS_REMOUNT)
-		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+		retval = do_loopback(&path, dev_name, flags & MS_REC);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
-		retval = do_change_type(&nd, flags);
+		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
-		retval = do_move_mount(&nd, dev_name);
+		retval = do_move_mount(&path, dev_name);
 	else
-		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
 dput_out:
-	path_put(&nd.path);
+	path_put(&path);
 	return retval;
 }
 
-- 
cgit v1.2.3


From 0a0d8a46757e2063433c8cd52b7d654e02b4682b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 00:55:27 -0400
Subject: [PATCH] no need for noinline stuff in fs/namespace.c anymore

Stack footprint from hell had been due to many struct nameidata in there.
No more.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 9f6005e5586..f527a0d6c64 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1425,9 +1425,8 @@ out_unlock:
 
 /*
  * recursively change the type of the mountpoint.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_change_type(struct path *path, int flag)
+static int do_change_type(struct path *path, int flag)
 {
 	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
@@ -1459,9 +1458,8 @@ static noinline int do_change_type(struct path *path, int flag)
 
 /*
  * do loopback mount.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_loopback(struct path *path, char *old_name,
+static int do_loopback(struct path *path, char *old_name,
 				int recurse)
 {
 	struct path old_path;
@@ -1528,9 +1526,8 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
  * on it - tough luck.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_remount(struct path *path, int flags, int mnt_flags,
+static int do_remount(struct path *path, int flags, int mnt_flags,
 		      void *data)
 {
 	int err;
@@ -1568,10 +1565,7 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
 	return 0;
 }
 
-/*
- * noinline this do_mount helper to save do_mount stack space.
- */
-static noinline int do_move_mount(struct path *path, char *old_name)
+static int do_move_mount(struct path *path, char *old_name)
 {
 	struct path old_path, parent_path;
 	struct vfsmount *p;
@@ -1648,9 +1642,8 @@ out:
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_new_mount(struct path *path, char *type, int flags,
+static int do_new_mount(struct path *path, char *type, int flags,
 			int mnt_flags, char *name, void *data)
 {
 	struct vfsmount *mnt;
-- 
cgit v1.2.3


From 8264613def2e5c4f12bc3167713090fd172e6055 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 00:57:06 -0400
Subject: [PATCH] switch quota_on-related stuff to kern_path()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dquot.c          | 10 +++++-----
 fs/ext3/super.c     | 22 +++++++++++-----------
 fs/ext4/super.c     | 24 ++++++++++++------------
 fs/reiserfs/super.c | 18 +++++++++---------
 4 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index da30a27f224..5e95261005b 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1805,19 +1805,19 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 }
 
 /* Actual function called from quotactl() */
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path,
+int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
 	if (remount)
 		return vfs_quota_on_remount(sb, type);
 
-	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	error = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (!error) {
-		error = vfs_quota_on_path(sb, type, format_id, &nd.path);
-		path_put(&nd.path);
+		error = vfs_quota_on_path(sb, type, format_id, &path);
+		path_put(&path);
 	}
 	return error;
 }
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a260af5544..5b7fee10566 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2783,30 +2783,30 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount)
+			 char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, path is NULL */
+	/* When remounting, no checks are needed and in fact, name is NULL */
 	if (remount)
-		return vfs_quota_on(sb, type, format_id, path, remount);
+		return vfs_quota_on(sb, type, format_id, name, remount);
 
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
 
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
-		path_put(&nd.path);
+	if (path.mnt->mnt_sb != sb) {
+		path_put(&path);
 		return -EXDEV;
 	}
 	/* Journaling quota? */
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		if (path.dentry->d_parent != sb->s_root)
 			printk(KERN_WARNING
 				"EXT3-fs: Quota file not on filesystem root. "
 				"Journaled quota will not work.\n");
@@ -2816,7 +2816,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext3_should_journal_data(nd.path.dentry->d_inode)) {
+	if (ext3_should_journal_data(path.dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -2826,8 +2826,8 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
 	}
 
-	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
-	path_put(&nd.path);
+	err = vfs_quota_on_path(sb, type, format_id, &path);
+	path_put(&path);
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9b2b2bc4ec1..ae35f176b69 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3328,30 +3328,30 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount)
+			 char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, path is NULL */
+	/* When remounting, no checks are needed and in fact, name is NULL */
 	if (remount)
-		return vfs_quota_on(sb, type, format_id, path, remount);
+		return vfs_quota_on(sb, type, format_id, name, remount);
 
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
 
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
-		path_put(&nd.path);
+	if (path.mnt->mnt_sb != sb) {
+		path_put(&path);
 		return -EXDEV;
 	}
 	/* Journaling quota? */
 	if (EXT4_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not in fs root? */
-		if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		if (path.dentry->d_parent != sb->s_root)
 			printk(KERN_WARNING
 				"EXT4-fs: Quota file not on filesystem root. "
 				"Journaled quota will not work.\n");
@@ -3361,7 +3361,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext4_should_journal_data(nd.path.dentry->d_inode)) {
+	if (ext4_should_journal_data(path.dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -3370,13 +3370,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 		if (err) {
-			path_put(&nd.path);
+			path_put(&path);
 			return err;
 		}
 	}
 
-	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
-	path_put(&nd.path);
+	err = vfs_quota_on_path(sb, type, format_id, &path);
+	path_put(&path);
 	return err;
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index d318c7e663f..663a91f5dce 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2058,10 +2058,10 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *path, int remount)
+			     char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 
@@ -2069,16 +2069,16 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		return -EINVAL;
 	/* No more checks needed? Path and format_id are bogus anyway... */
 	if (remount)
-		return vfs_quota_on(sb, type, format_id, path, 1);
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+		return vfs_quota_on(sb, type, format_id, name, 1);
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
+	if (path.mnt->mnt_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2094,7 +2094,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	/* Journaling quota? */
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		if (path.dentry->d_parent != sb->s_root)
 			reiserfs_warning(sb,
 				 "reiserfs: Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
@@ -2113,9 +2113,9 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
+	err = vfs_quota_on_path(sb, type, format_id, &path);
 out:
-	path_put(&nd.path);
+	path_put(&path);
 	return err;
 }
 
-- 
cgit v1.2.3


From c1a2a4756df01d72b6f7a0563218b23b63a6d4ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 01:01:02 -0400
Subject: [PATCH] sanitize svc_export_parse()

clean up the exit paths, get rid of nameidata

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c | 77 ++++++++++++++++++++++++++------------------------------
 1 file changed, 35 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9dc036f1835..2fa61f003ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -500,35 +500,22 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int len;
 	int err;
 	struct auth_domain *dom = NULL;
-	struct nameidata nd;
-	struct svc_export exp, *expp;
+	struct svc_export exp = {}, *expp;
 	int an_int;
 
-	nd.path.dentry = NULL;
-	exp.ex_pathname = NULL;
-
-	/* fs locations */
-	exp.ex_fslocs.locations = NULL;
-	exp.ex_fslocs.locations_count = 0;
-	exp.ex_fslocs.migrated = 0;
-
-	exp.ex_uuid = NULL;
-
-	/* secinfo */
-	exp.ex_nflavors = 0;
-
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
 
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!buf) goto out;
+	if (!buf)
+		return -ENOMEM;
 
 	/* client */
-	len = qword_get(&mesg, buf, PAGE_SIZE);
 	err = -EINVAL;
-	if (len <= 0) goto out;
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len <= 0)
+		goto out;
 
 	err = -ENOENT;
 	dom = auth_domain_find(buf);
@@ -537,25 +524,25 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* path */
 	err = -EINVAL;
-	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
-		goto out;
-	err = path_lookup(buf, 0, &nd);
-	if (err) goto out_no_path;
+	if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out1;
+
+	err = kern_path(buf, 0, &exp.ex_path);
+	if (err)
+		goto out1;
 
-	exp.h.flags = 0;
 	exp.ex_client = dom;
-	exp.ex_path.mnt = nd.path.mnt;
-	exp.ex_path.dentry = nd.path.dentry;
-	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
+
 	err = -ENOMEM;
+	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
 	if (!exp.ex_pathname)
-		goto out;
+		goto out2;
 
 	/* expiry */
 	err = -EINVAL;
 	exp.h.expiry_time = get_expiry(&mesg);
 	if (exp.h.expiry_time == 0)
-		goto out;
+		goto out3;
 
 	/* flags */
 	err = get_int(&mesg, &an_int);
@@ -563,22 +550,26 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = 0;
 		set_bit(CACHE_NEGATIVE, &exp.h.flags);
 	} else {
-		if (err || an_int < 0) goto out;	
+		if (err || an_int < 0)
+			goto out3;
 		exp.ex_flags= an_int;
 	
 		/* anon uid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_anon_uid= an_int;
 
 		/* anon gid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_anon_gid= an_int;
 
 		/* fsid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_fsid = an_int;
 
 		while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
@@ -604,12 +595,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				 */
 				break;
 			if (err)
-				goto out;
+				goto out4;
 		}
 
-		err = check_export(nd.path.dentry->d_inode, exp.ex_flags,
+		err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags,
 				   exp.ex_uuid);
-		if (err) goto out;
+		if (err)
+			goto out4;
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -622,15 +614,16 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = -ENOMEM;
 	else
 		exp_put(expp);
- out:
+out4:
 	nfsd4_fslocs_free(&exp.ex_fslocs);
 	kfree(exp.ex_uuid);
+out3:
 	kfree(exp.ex_pathname);
-	if (nd.path.dentry)
-		path_put(&nd.path);
- out_no_path:
-	if (dom)
-		auth_domain_put(dom);
+out2:
+	path_put(&exp.ex_path);
+out1:
+	auth_domain_put(dom);
+out:
 	kfree(buf);
 	return err;
 }
-- 
cgit v1.2.3


From a63bb99660d82dfe7c51588e1f9aadefb756ba51 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 01:03:36 -0400
Subject: [PATCH] switch nfsd to kern_path()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c      | 48 +++++++++++++++++++++++-------------------------
 fs/nfsd/nfs4recover.c | 50 +++++++++++++++++++++++++-------------------------
 fs/nfsd/nfs4state.c   |  8 ++++----
 fs/nfsd/nfsctl.c      |  8 ++++----
 4 files changed, 56 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 2fa61f003ff..676201dbdf8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -162,20 +162,18 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 			cache_put(&ek->h, &svc_expkey_cache);
 		else err = -ENOMEM;
 	} else {
-		struct nameidata nd;
-		err = path_lookup(buf, 0, &nd);
+		err = kern_path(buf, 0, &key.ek_path);
 		if (err)
 			goto out;
 
 		dprintk("Found the path %s\n", buf);
-		key.ek_path = nd.path;
 
 		ek = svc_expkey_update(&key, ek);
 		if (ek)
 			cache_put(&ek->h, &svc_expkey_cache);
 		else
 			err = -ENOMEM;
-		path_put(&nd.path);
+		path_put(&key.ek_path);
 	}
 	cache_flush();
  out:
@@ -991,7 +989,7 @@ exp_export(struct nfsctl_export *nxp)
 	struct svc_export	*exp = NULL;
 	struct svc_export	new;
 	struct svc_expkey	*fsid_key = NULL;
-	struct nameidata nd;
+	struct path path;
 	int		err;
 
 	/* Consistency check */
@@ -1014,12 +1012,12 @@ exp_export(struct nfsctl_export *nxp)
 
 
 	/* Look up the dentry */
-	err = path_lookup(nxp->ex_path, 0, &nd);
+	err = kern_path(nxp->ex_path, 0, &path);
 	if (err)
 		goto out_put_clp;
 	err = -EINVAL;
 
-	exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL);
+	exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL);
 
 	memset(&new, 0, sizeof(new));
 
@@ -1027,8 +1025,8 @@ exp_export(struct nfsctl_export *nxp)
 	if ((nxp->ex_flags & NFSEXP_FSID) &&
 	    (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
 	    fsid_key->ek_path.mnt &&
-	    (fsid_key->ek_path.mnt != nd.path.mnt ||
-	     fsid_key->ek_path.dentry != nd.path.dentry))
+	    (fsid_key->ek_path.mnt != path.mnt ||
+	     fsid_key->ek_path.dentry != path.dentry))
 		goto finish;
 
 	if (!IS_ERR(exp)) {
@@ -1044,7 +1042,7 @@ exp_export(struct nfsctl_export *nxp)
 		goto finish;
 	}
 
-	err = check_export(nd.path.dentry->d_inode, nxp->ex_flags, NULL);
+	err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL);
 	if (err) goto finish;
 
 	err = -ENOMEM;
@@ -1057,7 +1055,7 @@ exp_export(struct nfsctl_export *nxp)
 	if (!new.ex_pathname)
 		goto finish;
 	new.ex_client = clp;
-	new.ex_path = nd.path;
+	new.ex_path = path;
 	new.ex_flags = nxp->ex_flags;
 	new.ex_anon_uid = nxp->ex_anon_uid;
 	new.ex_anon_gid = nxp->ex_anon_gid;
@@ -1083,7 +1081,7 @@ finish:
 		exp_put(exp);
 	if (fsid_key && !IS_ERR(fsid_key))
 		cache_put(&fsid_key->h, &svc_expkey_cache);
-	path_put(&nd.path);
+	path_put(&path);
 out_put_clp:
 	auth_domain_put(clp);
 out_unlock:
@@ -1114,7 +1112,7 @@ exp_unexport(struct nfsctl_export *nxp)
 {
 	struct auth_domain *dom;
 	svc_export *exp;
-	struct nameidata nd;
+	struct path path;
 	int		err;
 
 	/* Consistency check */
@@ -1131,13 +1129,13 @@ exp_unexport(struct nfsctl_export *nxp)
 		goto out_unlock;
 	}
 
-	err = path_lookup(nxp->ex_path, 0, &nd);
+	err = kern_path(nxp->ex_path, 0, &path);
 	if (err)
 		goto out_domain;
 
 	err = -EINVAL;
-	exp = exp_get_by_name(dom, nd.path.mnt, nd.path.dentry, NULL);
-	path_put(&nd.path);
+	exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL);
+	path_put(&path);
 	if (IS_ERR(exp))
 		goto out_domain;
 
@@ -1159,26 +1157,26 @@ out_unlock:
  * since its harder to fool a kernel module than a user space program.
  */
 int
-exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
+exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
 {
 	struct svc_export	*exp;
-	struct nameidata	nd;
+	struct path		path;
 	struct inode		*inode;
 	struct svc_fh		fh;
 	int			err;
 
 	err = -EPERM;
 	/* NB: we probably ought to check that it's NUL-terminated */
-	if (path_lookup(path, 0, &nd)) {
-		printk("nfsd: exp_rootfh path not found %s", path);
+	if (kern_path(name, 0, &path)) {
+		printk("nfsd: exp_rootfh path not found %s", name);
 		return err;
 	}
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 
 	dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
-		 path, nd.path.dentry, clp->name,
+		 name, path.dentry, clp->name,
 		 inode->i_sb->s_id, inode->i_ino);
-	exp = exp_parent(clp, nd.path.mnt, nd.path.dentry, NULL);
+	exp = exp_parent(clp, path.mnt, path.dentry, NULL);
 	if (IS_ERR(exp)) {
 		err = PTR_ERR(exp);
 		goto out;
@@ -1188,7 +1186,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 	 * fh must be initialized before calling fh_compose
 	 */
 	fh_init(&fh, maxsize);
-	if (fh_compose(&fh, exp, nd.path.dentry, NULL))
+	if (fh_compose(&fh, exp, path.dentry, NULL))
 		err = -EINVAL;
 	else
 		err = 0;
@@ -1196,7 +1194,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 	fh_put(&fh);
 	exp_put(exp);
 out:
-	path_put(&nd.path);
+	path_put(&path);
 	return err;
 }
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 145b3c877a2..bb93946ace2 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -51,7 +51,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct nameidata rec_dir;
+static struct path rec_dir;
 static int rec_dir_init = 0;
 
 static void
@@ -121,9 +121,9 @@ out_no_tfm:
 static void
 nfsd4_sync_rec_dir(void)
 {
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
-	nfsd_sync_dir(rec_dir.path.dentry);
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	nfsd_sync_dir(rec_dir.dentry);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 }
 
 int
@@ -143,9 +143,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	nfs4_save_user(&uid, &gid);
 
 	/* lock the parent */
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.path.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -155,15 +155,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.path.mnt);
+	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_dir.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
@@ -226,7 +226,7 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 
 	nfs4_save_user(&uid, &gid);
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.path.mnt), O_RDONLY);
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
@@ -291,9 +291,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
-	dentry = lookup_one_len(name, rec_dir.path.dentry, namlen);
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		return status;
@@ -302,7 +302,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	if (!dentry->d_inode)
 		goto out;
 
-	status = nfsd4_clear_clid_dir(rec_dir.path.dentry, dentry);
+	status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
 out:
 	dput(dentry);
 	return status;
@@ -318,7 +318,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	if (!rec_dir_init || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -327,7 +327,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	nfs4_reset_user(uid, gid);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.path.mnt);
+	mnt_drop_write(rec_dir.mnt);
 out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -357,17 +357,17 @@ nfsd4_recdir_purge_old(void) {
 
 	if (!rec_dir_init)
 		return;
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.path.mnt);
+	mnt_drop_write(rec_dir.mnt);
 out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.path.dentry->d_name.name);
+			" directory %s\n", rec_dir.dentry->d_name.name);
 }
 
 static int
@@ -387,10 +387,10 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.path.dentry, load_recdir);
+	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.path.dentry->d_name.name);
+			" directory %s\n", rec_dir.dentry->d_name.name);
 	return status;
 }
 
@@ -412,7 +412,7 @@ nfsd4_init_recdir(char *rec_dirname)
 
 	nfs4_save_user(&uid, &gid);
 
-	status = path_lookup(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
 			&rec_dir);
 	if (status)
 		printk("NFSD: unable to find recovery directory %s\n",
@@ -429,5 +429,5 @@ nfsd4_shutdown_recdir(void)
 	if (!rec_dir_init)
 		return;
 	rec_dir_init = 0;
-	path_put(&rec_dir.path);
+	path_put(&rec_dir);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0cc7ff5d5ab..b0bebc552a1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3284,17 +3284,17 @@ int
 nfs4_reset_recoverydir(char *recdir)
 {
 	int status;
-	struct nameidata nd;
+	struct path path;
 
-	status = path_lookup(recdir, LOOKUP_FOLLOW, &nd);
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
 	if (status)
 		return status;
 	status = -ENOTDIR;
-	if (S_ISDIR(nd.path.dentry->d_inode->i_mode)) {
+	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
 		nfs4_set_recdir(recdir);
 		status = 0;
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return status;
 }
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 97543df5824..e3f9783fdcf 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -341,7 +341,7 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 
 static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	char *fo_path;
 	int error;
 
@@ -356,13 +356,13 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	error = path_lookup(fo_path, 0, &nd);
+	error = kern_path(fo_path, 0, &path);
 	if (error)
 		return error;
 
-	error = nlmsvc_unlock_all_by_sb(nd.path.mnt->mnt_sb);
+	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
-- 
cgit v1.2.3


From 421748ecde8e69a6364e5ae66eb3bf87e1f995c0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 01:04:36 -0400
Subject: [PATCH] assorted path_lookup() -> kern_path() conversions

more nameidata eviction

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c        | 14 +++++++-------
 fs/configfs/symlink.c | 16 ++++++++--------
 fs/ecryptfs/main.c    | 23 +++++++++--------------
 3 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 218408eed1b..d06fe3c3dd3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1268,33 +1268,33 @@ EXPORT_SYMBOL(ioctl_by_bdev);
  * namespace if possible and return it.  Return ERR_PTR(error)
  * otherwise.
  */
-struct block_device *lookup_bdev(const char *path)
+struct block_device *lookup_bdev(const char *pathname)
 {
 	struct block_device *bdev;
 	struct inode *inode;
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	if (!path || !*path)
+	if (!pathname || !*pathname)
 		return ERR_PTR(-EINVAL);
 
-	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
 	if (error)
 		return ERR_PTR(error);
 
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 	error = -ENOTBLK;
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
 	error = -EACCES;
-	if (nd.path.mnt->mnt_flags & MNT_NODEV)
+	if (path.mnt->mnt_flags & MNT_NODEV)
 		goto fail;
 	error = -ENOMEM;
 	bdev = bd_acquire(inode);
 	if (!bdev)
 		goto fail;
 out:
-	path_put(&nd.path);
+	path_put(&path);
 	return bdev;
 fail:
 	bdev = ERR_PTR(error);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index bf74973b049..932a92b3148 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -108,18 +108,18 @@ out:
 }
 
 
-static int get_target(const char *symname, struct nameidata *nd,
+static int get_target(const char *symname, struct path *path,
 		      struct config_item **target)
 {
 	int ret;
 
-	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
 	if (!ret) {
-		if (nd->path.dentry->d_sb == configfs_sb) {
-			*target = configfs_get_config_item(nd->path.dentry);
+		if (path->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(path->dentry);
 			if (!*target) {
 				ret = -ENOENT;
-				path_put(&nd->path);
+				path_put(path);
 			}
 		} else
 			ret = -EPERM;
@@ -132,7 +132,7 @@ static int get_target(const char *symname, struct nameidata *nd,
 int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
 	int ret;
-	struct nameidata nd;
+	struct path path;
 	struct configfs_dirent *sd;
 	struct config_item *parent_item;
 	struct config_item *target_item;
@@ -159,7 +159,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	    !type->ct_item_ops->allow_link)
 		goto out_put;
 
-	ret = get_target(symname, &nd, &target_item);
+	ret = get_target(symname, &path, &target_item);
 	if (ret)
 		goto out_put;
 
@@ -174,7 +174,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	}
 
 	config_item_put(target_item);
-	path_put(&nd.path);
+	path_put(&path);
 
 out_put:
 	config_item_put(parent_item);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 046e027a4cb..64d2ba980df 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -471,31 +471,26 @@ out:
  */
 static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 {
+	struct path path;
 	int rc;
-	struct nameidata nd;
-	struct dentry *lower_root;
-	struct vfsmount *lower_mnt;
 
-	memset(&nd, 0, sizeof(struct nameidata));
-	rc = path_lookup(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
+	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
 		goto out;
 	}
-	lower_root = nd.path.dentry;
-	lower_mnt = nd.path.mnt;
-	ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
-	sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
-	sb->s_blocksize = lower_root->d_sb->s_blocksize;
-	ecryptfs_set_dentry_lower(sb->s_root, lower_root);
-	ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
-	rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0);
+	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
+	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
+	ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
+	ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
+	rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
 	if (rc)
 		goto out_free;
 	rc = 0;
 	goto out;
 out_free:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From 8737f3a1b3c6a38a2a064552d4536633a5a16cd3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 Aug 2008 22:36:57 -0400
Subject: [PATCH] get rid of path_lookup_create()

... and don't pass bogus flags when we are just looking for parent.
Fold __path_lookup_intent_open() into path_lookup_open() while we
are at it; that's the only remaining caller.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 61 ++++++++++++++++++++++---------------------------------------
 1 file changed, 22 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 4a56f9b59e8..e584f04745b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1147,9 +1147,16 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 
 }
 
-static int __path_lookup_intent_open(int dfd, const char *name,
-		unsigned int lookup_flags, struct nameidata *nd,
-		int open_flags, int create_mode)
+/**
+ * path_lookup_open - lookup a file path with open intent
+ * @dfd: the directory to use as base, or AT_FDCWD
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ */
+int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags)
 {
 	struct file *filp = get_empty_filp();
 	int err;
@@ -1158,7 +1165,7 @@ static int __path_lookup_intent_open(int dfd, const char *name,
 		return -ENFILE;
 	nd->intent.open.file = filp;
 	nd->intent.open.flags = open_flags;
-	nd->intent.open.create_mode = create_mode;
+	nd->intent.open.create_mode = 0;
 	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
 	if (IS_ERR(nd->intent.open.file)) {
 		if (err == 0) {
@@ -1170,38 +1177,6 @@ static int __path_lookup_intent_open(int dfd, const char *name,
 	return err;
 }
 
-/**
- * path_lookup_open - lookup a file path with open intent
- * @dfd: the directory to use as base, or AT_FDCWD
- * @name: pointer to file name
- * @lookup_flags: lookup intent flags
- * @nd: pointer to nameidata
- * @open_flags: open intent flags
- */
-int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
-		struct nameidata *nd, int open_flags)
-{
-	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
-			open_flags, 0);
-}
-
-/**
- * path_lookup_create - lookup a file path with open + create intent
- * @dfd: the directory to use as base, or AT_FDCWD
- * @name: pointer to file name
- * @lookup_flags: lookup intent flags
- * @nd: pointer to nameidata
- * @open_flags: open intent flags
- * @create_mode: create intent flags
- */
-static int path_lookup_create(int dfd, const char *name,
-			      unsigned int lookup_flags, struct nameidata *nd,
-			      int open_flags, int create_mode)
-{
-	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
-			nd, open_flags, create_mode);
-}
-
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
@@ -1711,8 +1686,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
-				   &nd, flag, mode);
+	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
 
@@ -1723,10 +1697,18 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	 */
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
-		goto exit;
+		goto exit_parent;
 
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
 	dir = nd.path.dentry;
 	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
@@ -1831,6 +1813,7 @@ exit_dput:
 exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
+exit_parent:
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
-- 
cgit v1.2.3


From 3516586a424ea5727be089da6541cbd5644f0497 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 Aug 2008 03:00:49 -0400
Subject: [PATCH] make O_EXCL in nd->intent.flags visible in nd->flags

New flag: LOOKUP_EXCL.  Set before doing the final step of pathname
resolution on the paths that have LOOKUP_CREATE and O_EXCL.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/ops_inode.c | 2 +-
 fs/namei.c          | 4 +++-
 fs/nfs/dir.c        | 6 ++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 534e1e2c65c..d232991b904 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 			mark_inode_dirty(inode);
 			break;
 		} else if (PTR_ERR(inode) != -EEXIST ||
-			   (nd && (nd->intent.open.flags & O_EXCL))) {
+			   (nd && nd->flags & LOOKUP_EXCL)) {
 			gfs2_holder_uninit(ghs);
 			return PTR_ERR(inode);
 		}
diff --git a/fs/namei.c b/fs/namei.c
index e584f04745b..2b8f823eda4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1709,6 +1709,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	dir = nd.path.dentry;
 	nd.flags &= ~LOOKUP_PARENT;
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
+	if (flag & O_EXCL)
+		nd.flags |= LOOKUP_EXCL;
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
@@ -1906,7 +1908,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 	if (nd->last_type != LAST_NORM)
 		goto fail;
 	nd->flags &= ~LOOKUP_PARENT;
-	nd->flags |= LOOKUP_CREATE;
+	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
 	nd->intent.open.flags = O_EXCL;
 
 	/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index efdba2e802d..c216c8786c5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -707,9 +707,7 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 {
 	if (NFS_PROTO(dir)->version == 2)
 		return 0;
-	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-		return 0;
-	return (nd->intent.open.flags & O_EXCL) != 0;
+	return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL);
 }
 
 /*
@@ -1009,7 +1007,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 
 	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
 	 * the dentry. */
-	if (nd->intent.open.flags & O_EXCL) {
+	if (nd->flags & LOOKUP_EXCL) {
 		d_instantiate(dentry, NULL);
 		goto out;
 	}
-- 
cgit v1.2.3


From ca30bc99527ab968707bafc09e38807de7e70c4a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 00:27:59 +0200
Subject: [PATCH] hpfs: cleanup ->setattr

Reformat hpfs_notify_change to standard kernel style to make it readable
and rename it to hpfs_setattr as that's what the method is called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/file.c    |  2 +-
 fs/hpfs/hpfs_fn.h |  2 +-
 fs/hpfs/inode.c   | 29 +++++++++++++++++++----------
 fs/hpfs/namei.c   |  2 +-
 4 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index be8be5040e0..64ab5225920 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -143,5 +143,5 @@ const struct file_operations hpfs_file_ops =
 const struct inode_operations hpfs_file_iops =
 {
 	.truncate	= hpfs_truncate,
-	.setattr	= hpfs_notify_change,
+	.setattr	= hpfs_setattr,
 };
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 42ff60ccf2a..c2ea31bae31 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -275,7 +275,7 @@ void hpfs_init_inode(struct inode *);
 void hpfs_read_inode(struct inode *);
 void hpfs_write_inode(struct inode *);
 void hpfs_write_inode_nolock(struct inode *);
-int hpfs_notify_change(struct dentry *, struct iattr *);
+int hpfs_setattr(struct dentry *, struct iattr *);
 void hpfs_write_if_changed(struct inode *);
 void hpfs_delete_inode(struct inode *);
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 85d3e1d9ac0..39a1bfbea31 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -260,19 +260,28 @@ void hpfs_write_inode_nolock(struct inode *i)
 	brelse(bh);
 }
 
-int hpfs_notify_change(struct dentry *dentry, struct iattr *attr)
+int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	int error=0;
+	int error = -EINVAL;
+
 	lock_kernel();
-	if ( ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) ||
-	     (hpfs_sb(inode->i_sb)->sb_root == inode->i_ino) ) {
-		error = -EINVAL;
-	} else if ((error = inode_change_ok(inode, attr))) {
-	} else if ((error = inode_setattr(inode, attr))) {
-	} else {
-		hpfs_write_inode(inode);
-	}
+	if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
+		goto out_unlock;
+	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
+		goto out_unlock;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out_unlock;
+
+	error = inode_setattr(inode, attr);
+	if (error)
+		goto out_unlock;
+
+	hpfs_write_inode(inode);
+
+ out_unlock:
 	unlock_kernel();
 	return error;
 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d9c59a77544..10783f3d265 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -669,5 +669,5 @@ const struct inode_operations hpfs_dir_iops =
 	.rmdir		= hpfs_rmdir,
 	.mknod		= hpfs_mknod,
 	.rename		= hpfs_rename,
-	.setattr	= hpfs_notify_change,
+	.setattr	= hpfs_setattr,
 };
-- 
cgit v1.2.3


From a518ab9329041411526ab8e05edfda7e2715244f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:34:22 +0200
Subject: [PATCH] tidy up chrdev_open

Use a single goto label for chrdev_put + return error cases.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/char_dev.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index 262fa10e213..700697a7261 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -386,15 +386,22 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 	cdev_put(new);
 	if (ret)
 		return ret;
+
+	ret = -ENXIO;
 	filp->f_op = fops_get(p->ops);
-	if (!filp->f_op) {
-		cdev_put(p);
-		return -ENXIO;
-	}
-	if (filp->f_op->open)
+	if (!filp->f_op)
+		goto out_cdev_put;
+
+	if (filp->f_op->open) {
 		ret = filp->f_op->open(inode,filp);
-	if (ret)
-		cdev_put(p);
+		if (ret)
+			goto out_cdev_put;
+	}
+
+	return 0;
+
+ out_cdev_put:
+	cdev_put(p);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 3a8cff4f026c0b98bee6291eb28d4df42feb76dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:37:17 +0200
Subject: [PATCH] generic_file_llseek tidyups

Add kerneldoc for generic_file_llseek and generic_file_llseek_unlocked,
use sane variable names and unclutter the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/read_write.c | 58 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 9ba495d5a29..969a6d9c020 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,39 +31,61 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+/**
+ * generic_file_llseek_unlocked - lockless generic llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * Updates the file offset to the value specified by @offset and @origin.
+ * Locking must be provided by the caller.
+ */
 loff_t
 generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 {
-	loff_t retval;
 	struct inode *inode = file->f_mapping->host;
 
 	switch (origin) {
-		case SEEK_END:
-			offset += inode->i_size;
-			break;
-		case SEEK_CUR:
-			offset += file->f_pos;
+	case SEEK_END:
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		offset += file->f_pos;
+		break;
 	}
-	retval = -EINVAL;
-	if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
-		/* Special lock needed here? */
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-			file->f_version = 0;
-		}
-		retval = offset;
+
+	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+		return -EINVAL;
+
+	/* Special lock needed here? */
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
 	}
-	return retval;
+
+	return offset;
 }
 EXPORT_SYMBOL(generic_file_llseek_unlocked);
 
+/**
+ * generic_file_llseek - generic llseek implementation for regular files
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * This is a generic implemenation of ->llseek useable for all normal local
+ * filesystems.  It just updates the file offset to the value specified by
+ * @offset and @origin under i_mutex.
+ */
 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 {
-	loff_t n;
+	loff_t rval;
+
 	mutex_lock(&file->f_dentry->d_inode->i_mutex);
-	n = generic_file_llseek_unlocked(file, offset, origin);
+	rval = generic_file_llseek_unlocked(file, offset, origin);
 	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-	return n;
+
+	return rval;
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
-- 
cgit v1.2.3


From 4ea3ada2955e4519befa98ff55dd62d6dfbd1705 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:48:57 +0200
Subject: [PATCH] new helper: d_obtain_alias

The calling conventions of d_alloc_anon are rather unfortunate for all
users, and it's name is not very descriptive either.

Add d_obtain_alias as a new exported helper that drops the inode
reference in the failure case, too and allows to pass-through NULL
pointers and inodes to allow for tail-calls in the export operations.

Incidentally this helper already existed as a private function in
libfs.c as exportfs_d_alloc so kill that one and switch the callers
to d_obtain_alias.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 35 +++++++++++++++++++++++++++++++++++
 fs/libfs.c  | 26 ++------------------------
 2 files changed, 37 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index e7a1a99b746..46fc7820678 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1174,6 +1174,41 @@ struct dentry * d_alloc_anon(struct inode *inode)
 	return res;
 }
 
+/**
+ * d_obtain_alias - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
+ *
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  If %NULL is returned (indicating kmalloc failure),
+ * the reference on the inode has been released.  To make it easier
+ * to use in export operations a NULL or IS_ERR inode may be passed in
+ * and will be casted to the corresponding NULL or IS_ERR dentry.
+ */
+struct dentry *d_obtain_alias(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	dentry = d_alloc_anon(inode);
+	if (!dentry) {
+		iput(inode);
+		dentry = ERR_PTR(-ENOMEM);
+	}
+	return dentry;
+}
+EXPORT_SYMBOL_GPL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
diff --git a/fs/libfs.c b/fs/libfs.c
index 1add676a19d..74688598bcf 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -732,28 +732,6 @@ out:
 	return ret;
 }
 
-/*
- * This is what d_alloc_anon should have been.  Once the exportfs
- * argument transition has been finished I will update d_alloc_anon
- * to this prototype and this wrapper will go away.   --hch
- */
-static struct dentry *exportfs_d_alloc(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_PTR(PTR_ERR(inode));
-
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		dentry = ERR_PTR(-ENOMEM);
-	}
-	return dentry;
-}
-
 /**
  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
  * @sb:		filesystem to do the file handle conversion on
@@ -782,7 +760,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
 
@@ -815,7 +793,7 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
-- 
cgit v1.2.3


From 440037287c5ebb07033ab927ca16bb68c291d309 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:49:04 +0200
Subject: [PATCH] switch all filesystems over to d_obtain_alias

Switch all users of d_alloc_anon to d_obtain_alias.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c                   | 10 ++++-----
 fs/efs/namei.c                | 29 ++++--------------------
 fs/exportfs/expfs.c           |  4 ----
 fs/ext2/namei.c               | 13 +----------
 fs/ext3/namei.c               | 14 +-----------
 fs/ext4/namei.c               | 11 +--------
 fs/fat/inode.c                | 52 +++++++++++++++----------------------------
 fs/fuse/inode.c               | 23 +++++++------------
 fs/gfs2/ops_export.c          | 33 ++++++++-------------------
 fs/isofs/export.c             | 33 +++++----------------------
 fs/jfs/namei.c                | 15 +------------
 fs/nfs/getroot.c              | 14 +++++-------
 fs/ntfs/namei.c               | 22 ++----------------
 fs/ocfs2/export.c             | 30 +++++--------------------
 fs/reiserfs/inode.c           | 13 ++---------
 fs/reiserfs/namei.c           | 11 +--------
 fs/udf/namei.c                | 17 ++------------
 fs/xfs/linux-2.6/xfs_export.c | 32 +++-----------------------
 fs/xfs/linux-2.6/xfs_ioctl.c  |  7 +++---
 19 files changed, 78 insertions(+), 305 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 46fc7820678..d45ff7f5ecc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1187,17 +1187,17 @@ struct dentry * d_alloc_anon(struct inode *inode)
  * allocating a new one.
  *
  * On successful return, the reference to the inode has been transferred
- * to the dentry.  If %NULL is returned (indicating kmalloc failure),
- * the reference on the inode has been released.  To make it easier
- * to use in export operations a NULL or IS_ERR inode may be passed in
- * and will be casted to the corresponding NULL or IS_ERR dentry.
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and will be the error will be propagate to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
  */
 struct dentry *d_obtain_alias(struct inode *inode)
 {
 	struct dentry *dentry;
 
 	if (!inode)
-		return NULL;
+		return ERR_PTR(-ESTALE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 291abb11e20..c3fb5f9c4a4 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -112,35 +112,14 @@ struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 struct dentry *efs_get_parent(struct dentry *child)
 {
-	struct dentry *parent;
-	struct inode *inode;
+	struct dentry *parent = ERR_PTR(-ENOENT);
 	efs_ino_t ino;
-	long error;
 
 	lock_kernel();
-
-	error = -ENOENT;
 	ino = efs_find_entry(child->d_inode, "..", 2);
-	if (!ino)
-		goto fail;
-
-	inode = efs_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		goto fail;
-	}
-
-	error = -ENOMEM;
-	parent = d_alloc_anon(inode);
-	if (!parent)
-		goto fail_iput;
-
+	if (ino)
+		parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
 	unlock_kernel();
-	return parent;
 
- fail_iput:
-	iput(inode);
- fail:
-	unlock_kernel();
-	return ERR_PTR(error);
+	return parent;
 }
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cc91227d3bb..7b0f75dcf80 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -366,8 +366,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
-	if (!result)
-		result = ERR_PTR(-ESTALE);
 	if (IS_ERR(result))
 		return result;
 
@@ -422,8 +420,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 
 		target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
 				fh_len, fileid_type);
-		if (!target_dir)
-			goto err_result;
 		err = PTR_ERR(target_dir);
 		if (IS_ERR(target_dir))
 			goto err_result;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 80c97fd8c57..a1b328ab1e5 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -73,8 +73,6 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 struct dentry *ext2_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry *parent;
-	struct inode *inode;
 	struct dentry dotdot;
 
 	dotdot.d_name.name = "..";
@@ -83,16 +81,7 @@ struct dentry *ext2_get_parent(struct dentry *child)
 	ino = ext2_inode_by_name(child->d_inode, &dotdot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
-	inode = ext2_iget(child->d_inode->i_sb, ino);
-
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino));
 } 
 
 /*
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index de13e919cd8..880b54400ac 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1057,8 +1057,6 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 struct dentry *ext3_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry *parent;
-	struct inode *inode;
 	struct dentry dotdot;
 	struct ext3_dir_entry_2 * de;
 	struct buffer_head *bh;
@@ -1068,7 +1066,6 @@ struct dentry *ext3_get_parent(struct dentry *child)
 	dotdot.d_parent = child; /* confusing, isn't it! */
 
 	bh = ext3_find_entry(&dotdot, &de);
-	inode = NULL;
 	if (!bh)
 		return ERR_PTR(-ENOENT);
 	ino = le32_to_cpu(de->inode);
@@ -1080,16 +1077,7 @@ struct dentry *ext3_get_parent(struct dentry *child)
 		return ERR_PTR(-EIO);
 	}
 
-	inode = ext3_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
 }
 
 #define S_SHIFT 12
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92db9e94514..5b93a7d94d4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1083,16 +1083,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 		return ERR_PTR(-EIO);
 	}
 
-	inode = ext4_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
 }
 
 #define S_SHIFT 12
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d12cdf2a040..19eafbe3c37 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -681,33 +681,24 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 			inode = NULL;
 		}
 	}
-	if (!inode) {
-		/* For now, do nothing
-		 * What we could do is:
-		 * follow the file starting at fh[4], and record
-		 * the ".." entry, and the name of the fh[2] entry.
-		 * The follow the ".." file finding the next step up.
-		 * This way we build a path to the root of
-		 * the tree. If this works, we lookup the path and so
-		 * get this inode into the cache.
-		 * Finally try the fat_iget lookup again
-		 * If that fails, then weare totally out of luck
-		 * But all that is for another day
-		 */
-	}
-	if (!inode)
-		return ERR_PTR(-ESTALE);
-
 
-	/* now to find a dentry.
-	 * If possible, get a well-connected one
+	/*
+	 * For now, do nothing if the inode is not found.
+	 *
+	 * What we could do is:
+	 *
+	 *	- follow the file starting at fh[4], and record the ".." entry,
+	 *	  and the name of the fh[2] entry.
+	 *	- then follow the ".." file finding the next step up.
+	 *
+	 * This way we build a path to the root of the tree. If this works, we
+	 * lookup the path and so get this inode into the cache.  Finally try
+	 * the fat_iget lookup again.  If that fails, then we are totally out
+	 * of luck.  But all that is for another day
 	 */
-	result = d_alloc_anon(inode);
-	if (result == NULL) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	result->d_op = sb->s_root->d_op;
+	result = d_obtain_alias(inode);
+	if (!IS_ERR(result))
+		result->d_op = sb->s_root->d_op;
 	return result;
 }
 
@@ -754,15 +745,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
 	}
 	inode = fat_build_inode(sb, de, i_pos);
 	brelse(bh);
-	if (IS_ERR(inode)) {
-		parent = ERR_CAST(inode);
-		goto out;
-	}
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
+
+	parent = d_obtain_alias(inode);
 out:
 	unlock_super(sb);
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 54b1f0e1ef5..2e99f34b443 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -596,12 +596,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
 	if (inode->i_generation != handle->generation)
 		goto out_iput;
 
-	entry = d_alloc_anon(inode);
-	err = -ENOMEM;
-	if (!entry)
-		goto out_iput;
-
-	if (get_node_id(inode) != FUSE_ROOT_ID) {
+	entry = d_obtain_alias(inode);
+	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
 		entry->d_op = &fuse_dentry_operations;
 		fuse_invalidate_entry_cache(entry);
 	}
@@ -696,17 +692,14 @@ static struct dentry *fuse_get_parent(struct dentry *child)
 	name.name = "..";
 	err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
 			       &name, &outarg, &inode);
-	if (err && err != -ENOENT)
+	if (err) {
+		if (err == -ENOENT)
+			return ERR_PTR(-ESTALE);
 		return ERR_PTR(err);
-	if (err || !inode)
-		return ERR_PTR(-ESTALE);
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
 	}
-	if (get_node_id(inode) != FUSE_ROOT_ID) {
+
+	parent = d_obtain_alias(inode);
+	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
 		parent->d_op = &fuse_dentry_operations;
 		fuse_invalidate_entry_cache(parent);
 	}
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 9cda8536530..bbb8c36403a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -130,28 +130,17 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
 	struct qstr dotdot;
-	struct inode *inode;
 	struct dentry *dentry;
 
-	gfs2_str2qstr(&dotdot, "..");
-	inode = gfs2_lookupi(child->d_inode, &dotdot, 1);
-
-	if (!inode)
-		return ERR_PTR(-ENOENT);
 	/*
-	 * In case of an error, @inode carries the error value, and we
-	 * have to return that as a(n invalid) pointer to dentry.
+	 * XXX(hch): it would be a good idea to keep this around as a
+	 *	     static variable.
 	 */
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
+	gfs2_str2qstr(&dotdot, "..");
 
-	dentry->d_op = &gfs2_dops;
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
 	return dentry;
 }
 
@@ -233,13 +222,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	gfs2_glock_dq_uninit(&i_gh);
 
 out_inode:
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	dentry->d_op = &gfs2_dops;
+	dentry = d_obtain_alias(inode);
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
 	return dentry;
 
 fail_rgd:
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index bb219138331..e81a30593ba 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -22,7 +22,7 @@ isofs_export_iget(struct super_block *sb,
 		  __u32 generation)
 {
 	struct inode *inode;
-	struct dentry *result;
+
 	if (block == 0)
 		return ERR_PTR(-ESTALE);
 	inode = isofs_iget(sb, block, offset);
@@ -32,12 +32,7 @@ isofs_export_iget(struct super_block *sb,
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 /* This function is surprisingly simple.  The trick is understanding
@@ -51,7 +46,6 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
 	unsigned long parent_offset = 0;
 	struct inode *child_inode = child->d_inode;
 	struct iso_inode_info *e_child_inode = ISOFS_I(child_inode);
-	struct inode *parent_inode = NULL;
 	struct iso_directory_record *de = NULL;
 	struct buffer_head * bh = NULL;
 	struct dentry *rv = NULL;
@@ -104,28 +98,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
 	/* Normalize */
 	isofs_normalize_block_and_offset(de, &parent_block, &parent_offset);
 
-	/* Get the inode. */
-	parent_inode = isofs_iget(child_inode->i_sb,
-				  parent_block,
-				  parent_offset);
-	if (IS_ERR(parent_inode)) {
-		rv = ERR_CAST(parent_inode);
-		if (rv != ERR_PTR(-ENOMEM))
-			rv = ERR_PTR(-EACCES);
-		goto out;
-	}
-
-	/* Allocate the dentry. */
-	rv = d_alloc_anon(parent_inode);
-	if (rv == NULL) {
-		rv = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
+	rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block,
+				     parent_offset));
  out:
-	if (bh) {
+	if (bh)
 		brelse(bh);
-	}
 	return rv;
 }
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 2aba8238681..e199dde7b83 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1511,25 +1511,12 @@ struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 struct dentry *jfs_get_parent(struct dentry *dentry)
 {
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct dentry *parent = ERR_PTR(-ENOENT);
-	struct inode *inode;
 	unsigned long parent_ino;
 
 	parent_ino =
 		le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot);
-	inode = jfs_iget(sb, parent_ino);
-	if (IS_ERR(inode)) {
-		parent = ERR_CAST(inode);
-	} else {
-		parent = d_alloc_anon(inode);
-		if (!parent) {
-			parent = ERR_PTR(-ENOMEM);
-			iput(inode);
-		}
-	}
 
-	return parent;
+	return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino));
 }
 
 const struct inode_operations jfs_dir_inode_operations = {
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index fae97196daa..b7c9b2df1f2 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -107,11 +107,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_alloc_anon(inode);
-	if (!mntroot) {
-		iput(inode);
+	mntroot = d_obtain_alias(inode);
+	if (IS_ERR(mntroot)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return ERR_PTR(-ENOMEM);
+		return mntroot;
 	}
 
 	security_d_instantiate(mntroot, inode);
@@ -277,11 +276,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_alloc_anon(inode);
-	if (!mntroot) {
-		iput(inode);
+	mntroot = d_obtain_alias(inode);
+	if (IS_ERR(mntroot)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return ERR_PTR(-ENOMEM);
+		return mntroot;
 	}
 
 	security_d_instantiate(mntroot, inode);
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 9e8a95be7a1..2ca00153b6e 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -304,8 +304,6 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent)
 	ntfs_attr_search_ctx *ctx;
 	ATTR_RECORD *attr;
 	FILE_NAME_ATTR *fn;
-	struct inode *parent_vi;
-	struct dentry *parent_dent;
 	unsigned long parent_ino;
 	int err;
 
@@ -345,24 +343,8 @@ try_next:
 	/* Release the search context and the mft record of the child. */
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(ni);
-	/* Get the inode of the parent directory. */
-	parent_vi = ntfs_iget(vi->i_sb, parent_ino);
-	if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) {
-		if (!IS_ERR(parent_vi))
-			iput(parent_vi);
-		ntfs_error(vi->i_sb, "Failed to get parent directory inode "
-				"0x%lx of child inode 0x%lx.", parent_ino,
-				vi->i_ino);
-		return ERR_PTR(-EACCES);
-	}
-	/* Finally get a dentry for the parent directory and return it. */
-	parent_dent = d_alloc_anon(parent_vi);
-	if (unlikely(!parent_dent)) {
-		iput(parent_vi);
-		return ERR_PTR(-ENOMEM);
-	}
-	ntfs_debug("Done for inode 0x%lx.", vi->i_ino);
-	return parent_dent;
+
+	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
 }
 
 static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 67527cebf21..2f27b332d8b 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -68,14 +68,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 	}
 
-	result = d_alloc_anon(inode);
-
-	if (!result) {
-		iput(inode);
-		mlog_errno(-ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
-	result->d_op = &ocfs2_dentry_ops;
+	result = d_obtain_alias(inode);
+	if (!IS_ERR(result))
+		result->d_op = &ocfs2_dentry_ops;
 
 	mlog_exit_ptr(result);
 	return result;
@@ -86,7 +81,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	int status;
 	u64 blkno;
 	struct dentry *parent;
-	struct inode *inode;
 	struct inode *dir = child->d_inode;
 
 	mlog_entry("(0x%p, '%.*s')\n", child,
@@ -109,21 +103,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 		goto bail_unlock;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
-	if (IS_ERR(inode)) {
-		mlog(ML_ERROR, "Unable to create inode %llu\n",
-		     (unsigned long long)blkno);
-		parent = ERR_PTR(-EACCES);
-		goto bail_unlock;
-	}
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-
-	parent->d_op = &ocfs2_dentry_ops;
+	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
+	if (!IS_ERR(parent))
+		parent->d_op = &ocfs2_dentry_ops;
 
 bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 5699171212a..6c4c2c69449 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1522,7 +1522,6 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
 
 {
 	struct cpu_key key;
-	struct dentry *result;
 	struct inode *inode;
 
 	key.on_disk_key.k_objectid = objectid;
@@ -1535,16 +1534,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
 		inode = NULL;
 	}
 	reiserfs_write_unlock(sb);
-	if (!inode)
-		inode = ERR_PTR(-ESTALE);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+
+	return d_obtain_alias(inode);
 }
 
 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c1add28dd45..f89ebb943f3 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -383,7 +383,6 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
 	struct inode *inode = NULL;
 	struct reiserfs_dir_entry de;
 	INITIALIZE_PATH(path_to_entry);
-	struct dentry *parent;
 	struct inode *dir = child->d_inode;
 
 	if (dir->i_nlink == 0) {
@@ -401,15 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
 	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
 	reiserfs_write_unlock(dir->i_sb);
 
-	if (!inode || IS_ERR(inode)) {
-		return ERR_PTR(-EACCES);
-	}
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(inode);
 }
 
 /* add entry to the directory (entry can be hidden). 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d3231947db1..7578fae12d3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1243,7 +1243,6 @@ end_rename:
 
 static struct dentry *udf_get_parent(struct dentry *child)
 {
-	struct dentry *parent;
 	struct inode *inode = NULL;
 	struct dentry dotdot;
 	struct fileIdentDesc cfi;
@@ -1266,13 +1265,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 		goto out_unlock;
 	unlock_kernel();
 
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-
-	return parent;
+	return d_obtain_alias(inode);
 out_unlock:
 	unlock_kernel();
 	return ERR_PTR(-EACCES);
@@ -1283,7 +1276,6 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 					u16 partref, __u32 generation)
 {
 	struct inode *inode;
-	struct dentry *result;
 	kernel_lb_addr loc;
 
 	if (block == 0)
@@ -1300,12 +1292,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 static struct dentry *udf_fh_to_dentry(struct super_block *sb,
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 24fd598af84..7f7abec25e1 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -148,7 +148,6 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 {
 	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
 	struct inode		*inode = NULL;
-	struct dentry		*result;
 
 	if (fh_len < xfs_fileid_length(fileid_type))
 		return NULL;
@@ -164,16 +163,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 STATIC struct dentry *
@@ -182,7 +172,6 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
 {
 	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
 	struct inode		*inode = NULL;
-	struct dentry		*result;
 
 	switch (fileid_type) {
 	case FILEID_INO32_GEN_PARENT:
@@ -195,16 +184,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 STATIC struct dentry *
@@ -213,18 +193,12 @@ xfs_fs_get_parent(
 {
 	int			error;
 	struct xfs_inode	*cip;
-	struct dentry		*parent;
 
 	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
-	parent = d_alloc_anon(VFS_I(cip));
-	if (unlikely(!parent)) {
-		iput(VFS_I(cip));
-		return ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(VFS_I(cip));
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 48799ba7e3e..d3438c72dca 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -311,11 +311,10 @@ xfs_open_by_handle(
 		return new_fd;
 	}
 
-	dentry = d_alloc_anon(inode);
-	if (dentry == NULL) {
-		iput(inode);
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
 		put_unused_fd(new_fd);
-		return -XFS_ERROR(ENOMEM);
+		return PTR_ERR(dentry);
 	}
 
 	/* Ensure umount returns EBUSY on umounts while this file is open. */
-- 
cgit v1.2.3


From 9308a6128d9074e348d9f9b5822546fe12a794a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Aug 2008 15:49:12 +0200
Subject: [PATCH] kill d_alloc_anon

Remove d_alloc_anon now that no users are left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 108 +++++++++++++++++++++---------------------------------------
 1 file changed, 37 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d45ff7f5ecc..1710d2484fd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1110,70 +1110,6 @@ static inline struct hlist_head *d_hash(struct dentry *parent,
 	return dentry_hashtable + (hash & D_HASHMASK);
 }
 
-/**
- * d_alloc_anon - allocate an anonymous dentry
- * @inode: inode to allocate the dentry for
- *
- * This is similar to d_alloc_root.  It is used by filesystems when
- * creating a dentry for a given inode, often in the process of 
- * mapping a filehandle to a dentry.  The returned dentry may be
- * anonymous, or may have a full name (if the inode was already
- * in the cache).  The file system may need to make further
- * efforts to connect this dentry into the dcache properly.
- *
- * When called on a directory inode, we must ensure that
- * the inode only ever has one dentry.  If a dentry is
- * found, that is returned instead of allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry.  If %NULL is returned (indicating kmalloc failure),
- * the reference on the inode has not been released.
- */
-
-struct dentry * d_alloc_anon(struct inode *inode)
-{
-	static const struct qstr anonstring = { .name = "" };
-	struct dentry *tmp;
-	struct dentry *res;
-
-	if ((res = d_find_alias(inode))) {
-		iput(inode);
-		return res;
-	}
-
-	tmp = d_alloc(NULL, &anonstring);
-	if (!tmp)
-		return NULL;
-
-	tmp->d_parent = tmp; /* make sure dput doesn't croak */
-	
-	spin_lock(&dcache_lock);
-	res = __d_find_alias(inode, 0);
-	if (!res) {
-		/* attach a disconnected dentry */
-		res = tmp;
-		tmp = NULL;
-		spin_lock(&res->d_lock);
-		res->d_sb = inode->i_sb;
-		res->d_parent = res;
-		res->d_inode = inode;
-		res->d_flags |= DCACHE_DISCONNECTED;
-		res->d_flags &= ~DCACHE_UNHASHED;
-		list_add(&res->d_alias, &inode->i_dentry);
-		hlist_add_head(&res->d_hash, &inode->i_sb->s_anon);
-		spin_unlock(&res->d_lock);
-
-		inode = NULL; /* don't drop reference */
-	}
-	spin_unlock(&dcache_lock);
-
-	if (inode)
-		iput(inode);
-	if (tmp)
-		dput(tmp);
-	return res;
-}
-
 /**
  * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
@@ -1194,19 +1130,50 @@ struct dentry * d_alloc_anon(struct inode *inode)
  */
 struct dentry *d_obtain_alias(struct inode *inode)
 {
-	struct dentry *dentry;
+	static const struct qstr anonstring = { .name = "" };
+	struct dentry *tmp;
+	struct dentry *res;
 
 	if (!inode)
 		return ERR_PTR(-ESTALE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		dentry = ERR_PTR(-ENOMEM);
+	res = d_find_alias(inode);
+	if (res)
+		goto out_iput;
+
+	tmp = d_alloc(NULL, &anonstring);
+	if (!tmp) {
+		res = ERR_PTR(-ENOMEM);
+		goto out_iput;
 	}
-	return dentry;
+	tmp->d_parent = tmp; /* make sure dput doesn't croak */
+
+	spin_lock(&dcache_lock);
+	res = __d_find_alias(inode, 0);
+	if (res) {
+		spin_unlock(&dcache_lock);
+		dput(tmp);
+		goto out_iput;
+	}
+
+	/* attach a disconnected dentry */
+	spin_lock(&tmp->d_lock);
+	tmp->d_sb = inode->i_sb;
+	tmp->d_inode = inode;
+	tmp->d_flags |= DCACHE_DISCONNECTED;
+	tmp->d_flags &= ~DCACHE_UNHASHED;
+	list_add(&tmp->d_alias, &inode->i_dentry);
+	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+	spin_unlock(&tmp->d_lock);
+
+	spin_unlock(&dcache_lock);
+	return tmp;
+
+ out_iput:
+	iput(inode);
+	return res;
 }
 EXPORT_SYMBOL_GPL(d_obtain_alias);
 
@@ -2379,7 +2346,6 @@ void __init vfs_caches_init(unsigned long mempages)
 }
 
 EXPORT_SYMBOL(d_alloc);
-EXPORT_SYMBOL(d_alloc_anon);
 EXPORT_SYMBOL(d_alloc_root);
 EXPORT_SYMBOL(d_delete);
 EXPORT_SYMBOL(d_find_alias);
-- 
cgit v1.2.3


From f3f8e17571934ea253339e15c4ec9b34ea623c6b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 11 Aug 2008 12:39:47 -0400
Subject: [PATCH] reduce the stack footprint of exportfs_decode_fh()

no need to have _two_ 256-byte arrays on stack...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exportfs/expfs.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 7b0f75dcf80..51bdc5cab06 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -94,9 +94,8 @@ find_disconnected_root(struct dentry *dentry)
  * It may already be, as the flag isn't always updated when connection happens.
  */
 static int
-reconnect_path(struct vfsmount *mnt, struct dentry *target_dir)
+reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 {
-	char nbuf[NAME_MAX+1];
 	int noprogress = 0;
 	int err = -ESTALE;
 
@@ -360,6 +359,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 {
 	const struct export_operations *nop = mnt->mnt_sb->s_export_op;
 	struct dentry *result, *alias;
+	char nbuf[NAME_MAX+1];
 	int err;
 
 	/*
@@ -379,7 +379,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * filesystem root.
 		 */
 		if (result->d_flags & DCACHE_DISCONNECTED) {
-			err = reconnect_path(mnt, result);
+			err = reconnect_path(mnt, result, nbuf);
 			if (err)
 				goto err_result;
 		}
@@ -395,7 +395,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * It's not a directory.  Life is a little more complicated.
 		 */
 		struct dentry *target_dir, *nresult;
-		char nbuf[NAME_MAX+1];
 
 		/*
 		 * See if either the dentry we just got from the filesystem
@@ -429,7 +428,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * connected to the filesystem root.  The VFS really doesn't
 		 * like disconnected directories..
 		 */
-		err = reconnect_path(mnt, target_dir);
+		err = reconnect_path(mnt, target_dir, nbuf);
 		if (err) {
 			dput(target_dir);
 			goto err_result;
-- 
cgit v1.2.3


From 2628b766363aa3791e816a7e5807373ef551c776 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 31 Jul 2008 17:16:51 +0100
Subject: [PATCH] Factor out nfsd_do_readdir() into its own function

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index aa1d0d6489a..14eda20cc29 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1813,6 +1813,29 @@ out:
 	return err;
 }
 
+static int nfsd_do_readdir(struct file *file, filldir_t func,
+			   struct readdir_cd *cdp, loff_t *offsetp)
+{
+	int host_err;
+
+	/*
+	 * Read the directory entries. This silly loop is necessary because
+	 * readdir() is not guaranteed to fill up the entire buffer, but
+	 * may choose to do less.
+	 */
+	do {
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
+		host_err = vfs_readdir(file, func, cdp);
+	} while (host_err >=0 && cdp->err == nfs_ok);
+
+	*offsetp = vfs_llseek(file, 0, 1);
+
+	if (host_err)
+		return nfserrno(host_err);
+	else
+		return cdp->err;
+}
+
 /*
  * Read entries from a directory.
  * The  NFSv3/4 verifier we ignore for now.
@@ -1822,7 +1845,6 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	     struct readdir_cd *cdp, filldir_t func)
 {
 	__be32		err;
-	int 		host_err;
 	struct file	*file;
 	loff_t		offset = *offsetp;
 
@@ -1836,21 +1858,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 		goto out_close;
 	}
 
-	/*
-	 * Read the directory entries. This silly loop is necessary because
-	 * readdir() is not guaranteed to fill up the entire buffer, but
-	 * may choose to do less.
-	 */
-
-	do {
-		cdp->err = nfserr_eof; /* will be cleared on successful read */
-		host_err = vfs_readdir(file, func, cdp);
-	} while (host_err >=0 && cdp->err == nfs_ok);
-	if (host_err)
-		err = nfserrno(host_err);
-	else
-		err = cdp->err;
-	*offsetp = vfs_llseek(file, 0, 1);
+	err = nfsd_do_readdir(file, func, cdp, offsetp);
 
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
-- 
cgit v1.2.3


From 14f7dd632011bb89c035722edd6ea0d90ca6b078 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 31 Jul 2008 20:29:12 +0100
Subject: [PATCH] Copy XFS readdir hack into nfsd code.

Some file systems with their own internal locking have problems with the
way that nfsd calls the ->lookup() method from within a filldir function
called from their ->readdir() method. The recursion back into the file
system code can cause deadlock.

XFS has a fairly hackish solution to this which involves doing the
readdir() into a locally-allocated buffer, then going back through it
calling the filldir function afterwards. It's not ideal, but it works.

It's particularly suboptimal because XFS does this for local file
systems too, where it's completely unnecessary.

Copy this hack into the NFS code where it can be used only for NFS
export. In response to feedback, use it unconditionally rather than only
for the affected file systems.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 14eda20cc29..93b22f661d9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1813,27 +1813,105 @@ out:
 	return err;
 }
 
-static int nfsd_do_readdir(struct file *file, filldir_t func,
-			   struct readdir_cd *cdp, loff_t *offsetp)
+/*
+ * We do this buffering because we must not call back into the file
+ * system's ->lookup() method from the filldir callback. That may well
+ * deadlock a number of file systems.
+ *
+ * This is based heavily on the implementation of same in XFS.
+ */
+struct buffered_dirent {
+	u64		ino;
+	loff_t		offset;
+	int		namlen;
+	unsigned int	d_type;
+	char		name[];
+};
+
+struct readdir_data {
+	char		*dirent;
+	size_t		used;
+};
+
+static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
+				 loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct readdir_data *buf = __buf;
+	struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
+	unsigned int reclen;
+
+	reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
+	if (buf->used + reclen > PAGE_SIZE)
+		return -EINVAL;
+
+	de->namlen = namlen;
+	de->offset = offset;
+	de->ino = ino;
+	de->d_type = d_type;
+	memcpy(de->name, name, namlen);
+	buf->used += reclen;
+
+	return 0;
+}
+
+static int nfsd_buffered_readdir(struct file *file, filldir_t func,
+				 struct readdir_cd *cdp, loff_t *offsetp)
 {
+	struct readdir_data buf;
+	struct buffered_dirent *de;
 	int host_err;
+	int size;
+	loff_t offset;
 
-	/*
-	 * Read the directory entries. This silly loop is necessary because
-	 * readdir() is not guaranteed to fill up the entire buffer, but
-	 * may choose to do less.
-	 */
-	do {
-		cdp->err = nfserr_eof; /* will be cleared on successful read */
-		host_err = vfs_readdir(file, func, cdp);
-	} while (host_err >=0 && cdp->err == nfs_ok);
+	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf.dirent)
+		return -ENOMEM;
+
+	offset = *offsetp;
+	cdp->err = nfserr_eof; /* will be cleared on successful read */
 
-	*offsetp = vfs_llseek(file, 0, 1);
+	while (1) {
+		unsigned int reclen;
+
+		buf.used = 0;
+
+		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		if (host_err)
+			break;
+
+		size = buf.used;
+
+		if (!size)
+			break;
+
+
+		de = (struct buffered_dirent *)buf.dirent;
+		while (size > 0) {
+			offset = de->offset;
+
+			if (func(cdp, de->name, de->namlen, de->offset,
+				 de->ino, de->d_type))
+				goto done;
+
+			if (cdp->err != nfs_ok)
+				goto done;
+
+			reclen = ALIGN(sizeof(*de) + de->namlen,
+				       sizeof(u64));
+			size -= reclen;
+			de = (struct buffered_dirent *)((char *)de + reclen);
+		}
+		offset = vfs_llseek(file, 0, 1);
+	}
+
+ done:
+	free_page((unsigned long)(buf.dirent));
 
 	if (host_err)
 		return nfserrno(host_err);
-	else
-		return cdp->err;
+
+	*offsetp = offset;
+	return cdp->err;
 }
 
 /*
@@ -1858,7 +1936,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 		goto out_close;
 	}
 
-	err = nfsd_do_readdir(file, func, cdp, offsetp);
+	err = nfsd_buffered_readdir(file, func, cdp, offsetp);
 
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
-- 
cgit v1.2.3


From d88f1833fcbb5663c86253039966f880f8f46b1a Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 31 Jul 2008 20:38:04 +0100
Subject: [PATCH] Remove XFS buffered readdir hack

Now that we've moved the readdir hack to the nfsd code, we can
remove the local version from the XFS code.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/linux-2.6/xfs_file.c | 128 --------------------------------------------
 1 file changed, 128 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5311c1acdd4..3fee790f138 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -204,15 +204,6 @@ xfs_file_fsync(
 	return -xfs_fsync(XFS_I(dentry->d_inode));
 }
 
-/*
- * Unfortunately we can't just use the clean and simple readdir implementation
- * below, because nfs might call back into ->lookup from the filldir callback
- * and that will deadlock the low-level btree code.
- *
- * Hopefully we'll find a better workaround that allows to use the optimal
- * version at least for local readdirs for 2.6.25.
- */
-#if 0
 STATIC int
 xfs_file_readdir(
 	struct file	*filp,
@@ -244,125 +235,6 @@ xfs_file_readdir(
 		return -error;
 	return 0;
 }
-#else
-
-struct hack_dirent {
-	u64		ino;
-	loff_t		offset;
-	int		namlen;
-	unsigned int	d_type;
-	char		name[];
-};
-
-struct hack_callback {
-	char		*dirent;
-	size_t		len;
-	size_t		used;
-};
-
-STATIC int
-xfs_hack_filldir(
-	void		*__buf,
-	const char	*name,
-	int		namlen,
-	loff_t		offset,
-	u64		ino,
-	unsigned int	d_type)
-{
-	struct hack_callback *buf = __buf;
-	struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
-	unsigned int reclen;
-
-	reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > buf->len)
-		return -EINVAL;
-
-	de->namlen = namlen;
-	de->offset = offset;
-	de->ino = ino;
-	de->d_type = d_type;
-	memcpy(de->name, name, namlen);
-	buf->used += reclen;
-	return 0;
-}
-
-STATIC int
-xfs_file_readdir(
-	struct file	*filp,
-	void		*dirent,
-	filldir_t	filldir)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-	xfs_inode_t	*ip = XFS_I(inode);
-	struct hack_callback buf;
-	struct hack_dirent *de;
-	int		error;
-	loff_t		size;
-	int		eof = 0;
-	xfs_off_t       start_offset, curr_offset, offset;
-
-	/*
-	 * Try fairly hard to get memory
-	 */
-	buf.len = PAGE_CACHE_SIZE;
-	do {
-		buf.dirent = kmalloc(buf.len, GFP_KERNEL);
-		if (buf.dirent)
-			break;
-		buf.len >>= 1;
-	} while (buf.len >= 1024);
-
-	if (!buf.dirent)
-		return -ENOMEM;
-
-	curr_offset = filp->f_pos;
-	if (curr_offset == 0x7fffffff)
-		offset = 0xffffffff;
-	else
-		offset = filp->f_pos;
-
-	while (!eof) {
-		unsigned int reclen;
-
-		start_offset = offset;
-
-		buf.used = 0;
-		error = -xfs_readdir(ip, &buf, buf.len, &offset,
-				     xfs_hack_filldir);
-		if (error || offset == start_offset) {
-			size = 0;
-			break;
-		}
-
-		size = buf.used;
-		de = (struct hack_dirent *)buf.dirent;
-		while (size > 0) {
-			curr_offset = de->offset /* & 0x7fffffff */;
-			if (filldir(dirent, de->name, de->namlen,
-					curr_offset & 0x7fffffff,
-					de->ino, de->d_type)) {
-				goto done;
-			}
-
-			reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen,
-				       sizeof(u64));
-			size -= reclen;
-			de = (struct hack_dirent *)((char *)de + reclen);
-		}
-	}
-
- done:
-	if (!error) {
-		if (size == 0)
-			filp->f_pos = offset & 0x7fffffff;
-		else if (de)
-			filp->f_pos = curr_offset;
-	}
-
-	kfree(buf.dirent);
-	return error;
-}
-#endif
 
 STATIC int
 xfs_file_mmap(
-- 
cgit v1.2.3


From 5f556aab907a358c7837cc9a83c3aea4e69cff5b Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 31 Jul 2008 20:39:25 +0100
Subject: [JFFS2] Reinstate NFS exportability

Now that the readdir/lookup deadlock issues have been dealt with, we can
export JFFS2 file systems again.

(For now, you have to specify fsid manually; we should add a method to
the export_ops to handle that too.)

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/super.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index efd401257ed..4c4e18c54a5 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -22,6 +22,7 @@
 #include <linux/mtd/super.h>
 #include <linux/ctype.h>
 #include <linux/namei.h>
+#include <linux/exportfs.h>
 #include "compr.h"
 #include "nodelist.h"
 
@@ -62,6 +63,52 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino,
+					 uint32_t generation)
+{
+	/* We don't care about i_generation. We'll destroy the flash
+	   before we start re-using inode numbers anyway. And even
+	   if that wasn't true, we'd have other problems...*/
+	return jffs2_iget(sb, ino);
+}
+
+static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_get_parent(struct dentry *child)
+{
+	struct jffs2_inode_info *f;
+	uint32_t pino;
+
+	BUG_ON(!S_ISDIR(child->d_inode->i_mode));
+
+	f = JFFS2_INODE_INFO(child->d_inode);
+
+	pino = f->inocache->pino_nlink;
+
+	JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
+		    f->inocache->ino, pino);
+
+	return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
+}
+
+static struct export_operations jffs2_export_ops = {
+	.get_parent = jffs2_get_parent,
+	.fh_to_dentry = jffs2_fh_to_dentry,
+	.fh_to_parent = jffs2_fh_to_parent,
+};
+
 static const struct super_operations jffs2_super_operations =
 {
 	.alloc_inode =	jffs2_alloc_inode,
@@ -104,6 +151,7 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&c->inocache_lock);
 
 	sb->s_op = &jffs2_super_operations;
+	sb->s_export_op = &jffs2_export_ops;
 	sb->s_flags = sb->s_flags | MS_NOATIME;
 	sb->s_xattr = jffs2_xattr_handlers;
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-- 
cgit v1.2.3


From 734711abac46c8fee4d70cc9876ebc6d9edb4971 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Aug 2008 07:26:48 -0400
Subject: [PATCH] get rid of on-stack fake dentry in ext3_get_parent()

Better pass parent and qstr to ext3_find_entry() explicitly than
use such kludges, especially since the stack footprint is nasty
enough and we have every chance to be deep in call chain.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/namei.c | 70 +++++++++++++++++++++++++++------------------------------
 1 file changed, 33 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 880b54400ac..3e5edc92aa0 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -159,7 +159,7 @@ static void dx_set_count (struct dx_entry *entries, unsigned value);
 static void dx_set_limit (struct dx_entry *entries, unsigned value);
 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
 static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
@@ -176,8 +176,9 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-		       struct ext3_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err);
 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -342,7 +343,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(struct qstr *entry, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
 {
 	unsigned count, indirect;
@@ -353,8 +354,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (dentry)
-		dir = dentry->d_parent->d_inode;
 	if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
 		goto fail;
 	root = (struct dx_root *) bh->b_data;
@@ -370,8 +369,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	}
 	hinfo->hash_version = root->info.hash_version;
 	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
-	if (dentry)
-		ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	if (entry)
+		ext3fs_dirhash(entry->name, entry->len, hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -803,15 +802,15 @@ static inline int ext3_match (int len, const char * const name,
  */
 static inline int search_dirblock(struct buffer_head * bh,
 				  struct inode *dir,
-				  struct dentry *dentry,
+				  struct qstr *child,
 				  unsigned long offset,
 				  struct ext3_dir_entry_2 ** res_dir)
 {
 	struct ext3_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = child->name;
+	int namelen = child->len;
 
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -850,8 +849,9 @@ static inline int search_dirblock(struct buffer_head * bh,
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext3_find_entry (struct dentry *dentry,
-					struct ext3_dir_entry_2 ** res_dir)
+static struct buffer_head *ext3_find_entry(struct inode *dir,
+					struct qstr *entry,
+					struct ext3_dir_entry_2 **res_dir)
 {
 	struct super_block * sb;
 	struct buffer_head * bh_use[NAMEI_RA_SIZE];
@@ -863,16 +863,15 @@ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
 				   buffer */
 	int num = 0;
 	int nblocks, i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = dentry->d_name.len;
+	namelen = entry->len;
 	if (namelen > EXT3_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-		bh = ext3_dx_find_entry(dentry, res_dir, &err);
+		bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -923,7 +922,7 @@ restart:
 			brelse(bh);
 			goto next;
 		}
-		i = search_dirblock(bh, dir, dentry,
+		i = search_dirblock(bh, dir, entry,
 			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT3_I(dir)->i_dir_start_lookup = block;
@@ -957,8 +956,9 @@ cleanup_and_exit:
 	return ret;
 }
 
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-		       struct ext3_dir_entry_2 **res_dir, int *err)
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err)
 {
 	struct super_block * sb;
 	struct dx_hash_info	hinfo;
@@ -968,14 +968,13 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 	struct buffer_head *bh;
 	unsigned long block;
 	int retval;
-	int namelen = dentry->d_name.len;
-	const u8 *name = dentry->d_name.name;
-	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen = entry->len;
+	const u8 *name = entry->name;
 
 	sb = dir->i_sb;
 	/* NFS may look up ".." - look at dx_root directory block */
-	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+	if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
+		if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
 			return NULL;
 	} else {
 		frame = frames;
@@ -1036,7 +1035,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 	if (dentry->d_name.len > EXT3_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext3_find_entry(dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
@@ -1057,15 +1056,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 struct dentry *ext3_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry dotdot;
+	struct qstr dotdot = {.name = "..", .len = 2};
 	struct ext3_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_parent = child; /* confusing, isn't it! */
-
-	bh = ext3_find_entry(&dotdot, &de);
+	bh = ext3_find_entry(child->d_inode, &dotdot, &de);
 	if (!bh)
 		return ERR_PTR(-ENOENT);
 	ino = le32_to_cpu(de->inode);
@@ -1491,7 +1486,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct ext3_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -2044,7 +2039,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext3_find_entry (dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2106,7 +2101,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext3_find_entry (dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2264,7 +2259,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext3_find_entry (old_dentry, &old_de);
+	old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2277,7 +2272,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext3_find_entry (new_dentry, &new_de);
+	new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
 			brelse (new_bh);
@@ -2343,7 +2338,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext3_dir_entry_2 *old_de2;
 
-		old_bh2 = ext3_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
+					  &old_de2);
 		if (old_bh2) {
 			retval = ext3_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
-- 
cgit v1.2.3


From a9885444f7ff6e9156adb1adf5558ded9a39ad0a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Aug 2008 07:28:39 -0400
Subject: [PATCH] get rid of on-stack dentry in ext2_get_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/dir.c   | 14 +++++++-------
 fs/ext2/ext2.h  |  4 ++--
 fs/ext2/namei.c | 17 ++++++-----------
 3 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 11a49ce8439..9a0fc400f91 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -354,11 +354,11 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
  * (as a parameter - res_dir). Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
-struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
-			struct dentry *dentry, struct page ** res_page)
+struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
+			struct qstr *child, struct page ** res_page)
 {
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = child->name;
+	int namelen = child->len;
 	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
@@ -431,13 +431,13 @@ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
 	return de;
 }
 
-ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
+ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 {
 	ino_t res = 0;
-	struct ext2_dir_entry_2 * de;
+	struct ext2_dir_entry_2 *de;
 	struct page *page;
 	
-	de = ext2_find_entry (dir, dentry, &page);
+	de = ext2_find_entry (dir, child, &page);
 	if (de) {
 		res = le32_to_cpu(de->inode);
 		ext2_put_page(page);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index bae998c1e44..3203042b36e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -105,9 +105,9 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind
 
 /* dir.c */
 extern int ext2_add_link (struct dentry *, struct inode *);
-extern ino_t ext2_inode_by_name(struct inode *, struct dentry *);
+extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
 extern int ext2_make_empty(struct inode *, struct inode *);
-extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct dentry *, struct page **);
+extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index a1b328ab1e5..2a747252ec1 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -60,7 +60,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	if (dentry->d_name.len > EXT2_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ino = ext2_inode_by_name(dir, dentry);
+	ino = ext2_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
@@ -72,13 +72,8 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 
 struct dentry *ext2_get_parent(struct dentry *child)
 {
-	unsigned long ino;
-	struct dentry dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-
-	ino = ext2_inode_by_name(child->d_inode, &dotdot);
+	struct qstr dotdot = {.name = "..", .len = 2};
+	unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
 	return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino));
@@ -246,7 +241,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
 	struct page * page;
 	int err = -ENOENT;
 
-	de = ext2_find_entry (dir, dentry, &page);
+	de = ext2_find_entry (dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
 
@@ -288,7 +283,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	struct ext2_dir_entry_2 * old_de;
 	int err = -ENOENT;
 
-	old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
+	old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
 
@@ -308,7 +303,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = ext2_find_entry (new_dir, new_dentry, &new_page);
+		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-- 
cgit v1.2.3


From 53c9c5c0e32c69f9df1822e47671c13e3402c82f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Aug 2008 07:29:52 -0400
Subject: [PATCH] prepare vfs_readdir() callers to returning filldir result

It's not the final state, but it allows moving ->readdir() instances
to passing filldir return value to caller of vfs_readdir().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c         | 22 ++++++++--------------
 fs/exportfs/expfs.c |  7 ++++---
 fs/nfsd/vfs.c       | 11 +++++++++--
 fs/readdir.c        | 22 ++++++++--------------
 4 files changed, 29 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 5f9ec449c79..cb36245f9fe 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -869,7 +869,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 	buf.dirent = dirent;
 
 	error = vfs_readdir(file, compat_fillonedir, &buf);
-	if (error >= 0)
+	if (buf.result)
 		error = buf.result;
 
 	fput(file);
@@ -956,9 +956,8 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	buf.error = 0;
 
 	error = vfs_readdir(file, compat_filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		if (put_user(file->f_pos, &lastdirent->d_off))
@@ -966,8 +965,6 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 		else
 			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
@@ -1047,19 +1044,16 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	buf.error = 0;
 
 	error = vfs_readdir(file, compat_filldir64, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		typeof(lastdirent->d_off) d_off = file->f_pos;
-		error = -EFAULT;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
-			goto out_putf;
-		error = count - buf.count;
+			error = -EFAULT;
+		else
+			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51bdc5cab06..80246bad1b7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -280,13 +280,14 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry,
 		int old_seq = buffer.sequence;
 
 		error = vfs_readdir(file, filldir_one, &buffer);
+		if (buffer.found) {
+			error = 0;
+			break;
+		}
 
 		if (error < 0)
 			break;
 
-		error = 0;
-		if (buffer.found)
-			break;
 		error = -ENOENT;
 		if (old_seq == buffer.sequence)
 			break;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 93b22f661d9..e3e37f7c847 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1831,6 +1831,7 @@ struct buffered_dirent {
 struct readdir_data {
 	char		*dirent;
 	size_t		used;
+	int		full;
 };
 
 static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
@@ -1841,8 +1842,10 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 	unsigned int reclen;
 
 	reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > PAGE_SIZE)
+	if (buf->used + reclen > PAGE_SIZE) {
+		buf->full = 1;
 		return -EINVAL;
+	}
 
 	de->namlen = namlen;
 	de->offset = offset;
@@ -1874,9 +1877,13 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		unsigned int reclen;
 
 		buf.used = 0;
+		buf.full = 0;
 
 		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
-		if (host_err)
+		if (buf.full)
+			host_err = 0;
+
+		if (host_err < 0)
 			break;
 
 		size = buf.used;
diff --git a/fs/readdir.c b/fs/readdir.c
index 93a7559bbfd..b318d9b5af2 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -117,7 +117,7 @@ asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * di
 	buf.dirent = dirent;
 
 	error = vfs_readdir(file, fillonedir, &buf);
-	if (error >= 0)
+	if (buf.result)
 		error = buf.result;
 
 	fput(file);
@@ -209,9 +209,8 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren
 	buf.error = 0;
 
 	error = vfs_readdir(file, filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		if (put_user(file->f_pos, &lastdirent->d_off))
@@ -219,8 +218,6 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren
 		else
 			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
@@ -293,19 +290,16 @@ asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * d
 	buf.error = 0;
 
 	error = vfs_readdir(file, filldir64, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		typeof(lastdirent->d_off) d_off = file->f_pos;
-		error = -EFAULT;
 		if (__put_user(d_off, &lastdirent->d_off))
-			goto out_putf;
-		error = count - buf.count;
+			error = -EFAULT;
+		else
+			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
-- 
cgit v1.2.3


From c002a6c7977320f95b5edede5ce4e0eeecf291ff Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 17 Aug 2008 17:21:18 +0100
Subject: [PATCH] Optimise NFS readdir hack slightly.

Avoid calling the underlying ->readdir() again when we reached the end
already; keep going round the loop only if we stopped due to our own
buffer being full.

[AV: tidy the things up a bit, while we are there]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index e3e37f7c847..49d4b8725ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1891,7 +1891,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		if (!size)
 			break;
 
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1908,7 +1907,9 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		offset = vfs_llseek(file, 0, 1);
+		offset = vfs_llseek(file, 0, SEEK_CUR);
+		if (!buf.full)
+			break;
 	}
 
  done:
-- 
cgit v1.2.3


From 8966c5e0fc867f5a7da5756b4cd1b8bbbed3d5dd Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 18 Aug 2008 15:36:47 +0100
Subject: [JFFS2] Use d_splice_alias() not d_add() in jffs2_lookup()

Now that JFFS2 can be exported by NFS, we need to get this right.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/dir.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index b1aaae823a5..621bdfa994e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -108,9 +108,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 		}
 	}
 
-	d_add(target, inode);
-
-	return NULL;
+	return d_splice_alias(inode, target);
 }
 
 /***********************************************************************/
-- 
cgit v1.2.3


From 6de24f0ed08054b2a202902e4d63beff27654db8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 28 Aug 2008 06:25:49 +0400
Subject: [PATCH 1/2] anondev: init IDR statically

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/super.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index e931ae9511f..dd23bf927fb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -682,7 +682,7 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-static struct idr unnamed_dev_idr;
+static DEFINE_IDR(unnamed_dev_idr);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
@@ -726,11 +726,6 @@ void kill_anon_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_anon_super);
 
-void __init unnamed_dev_init(void)
-{
-	idr_init(&unnamed_dev_idr);
-}
-
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)
-- 
cgit v1.2.3


From ad76cbc63b9db7c98da49af3182a783ca1c80a5d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 28 Aug 2008 06:26:23 +0400
Subject: [PATCH 2/2] anondev: switch to IDA

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/super.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index dd23bf927fb..f31ef824d06 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -682,7 +682,7 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-static DEFINE_IDR(unnamed_dev_idr);
+static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
@@ -691,10 +691,10 @@ int set_anon_super(struct super_block *s, void *data)
 	int error;
 
  retry:
-	if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0)
+	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
 		return -ENOMEM;
 	spin_lock(&unnamed_dev_lock);
-	error = idr_get_new(&unnamed_dev_idr, NULL, &dev);
+	error = ida_get_new(&unnamed_dev_ida, &dev);
 	spin_unlock(&unnamed_dev_lock);
 	if (error == -EAGAIN)
 		/* We raced and lost with another CPU. */
@@ -704,7 +704,7 @@ int set_anon_super(struct super_block *s, void *data)
 
 	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
-		idr_remove(&unnamed_dev_idr, dev);
+		ida_remove(&unnamed_dev_ida, dev);
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
@@ -720,7 +720,7 @@ void kill_anon_super(struct super_block *sb)
 
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
-	idr_remove(&unnamed_dev_idr, slot);
+	ida_remove(&unnamed_dev_ida, slot);
 	spin_unlock(&unnamed_dev_lock);
 }
 
-- 
cgit v1.2.3


From 9fbb76ce0fe96c07c44ba2aec3dc99f4b8d2b9c6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Oct 2008 00:15:17 -0400
Subject: [PATCH] get rid of on-stack dentry in udf

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/namei.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 7578fae12d3..082409cd4b8 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -142,7 +142,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 }
 
 static struct fileIdentDesc *udf_find_entry(struct inode *dir,
-					    struct dentry *dentry,
+					    struct qstr *child,
 					    struct udf_fileident_bh *fibh,
 					    struct fileIdentDesc *cfi)
 {
@@ -159,8 +159,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	sector_t offset;
 	struct extent_position epos = {};
 	struct udf_inode_info *dinfo = UDF_I(dir);
-	int isdotdot = dentry->d_name.len == 2 &&
-		dentry->d_name.name[0] == '.' && dentry->d_name.name[1] == '.';
+	int isdotdot = child->len == 2 &&
+		child->name[0] == '.' && child->name[1] == '.';
 
 	size = udf_ext0_offset(dir) + dir->i_size;
 	f_pos = udf_ext0_offset(dir);
@@ -238,8 +238,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 			continue;
 
 		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
-		if (flen && udf_match(flen, fname, dentry->d_name.len,
-				      dentry->d_name.name))
+		if (flen && udf_match(flen, fname, child->len, child->name))
 			goto out_ok;
 	}
 
@@ -283,7 +282,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 	} else
 #endif /* UDF_RECOVERY */
 
-	if (udf_find_entry(dir, dentry, &fibh, &cfi)) {
+	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
@@ -783,7 +782,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	lock_kernel();
-	fi = udf_find_entry(dir, dentry, &fibh, &cfi);
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
 		goto out;
 
@@ -829,7 +828,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	lock_kernel();
-	fi = udf_find_entry(dir, dentry, &fibh, &cfi);
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
 		goto out;
 
@@ -1113,7 +1112,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
 	lock_kernel();
-	ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi);
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
 		if (ofibh.sbh != ofibh.ebh)
 			brelse(ofibh.ebh);
@@ -1124,7 +1123,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    != old_inode->i_ino)
 		goto end_rename;
 
-	nfi = udf_find_entry(new_dir, new_dentry, &nfibh, &ncfi);
+	nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
 	if (nfi) {
 		if (!new_inode) {
 			if (nfibh.sbh != nfibh.ebh)
@@ -1192,7 +1191,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
 
 	/* The old fid may have moved - find it again */
-	ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi);
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	udf_delete_entry(old_dir, ofi, &ofibh, &ocfi);
 
 	if (new_inode) {
@@ -1244,13 +1243,10 @@ end_rename:
 static struct dentry *udf_get_parent(struct dentry *child)
 {
 	struct inode *inode = NULL;
-	struct dentry dotdot;
+	struct qstr dotdot = {.name = "..", .len = 2};
 	struct fileIdentDesc cfi;
 	struct udf_fileident_bh fibh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-
 	lock_kernel();
 	if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
 		goto out_unlock;
-- 
cgit v1.2.3


From 871c0067d53ba2dc35897c7da1da675bf4c70511 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:27 +0900
Subject: [PATCH vfs-2.6 1/6] vfs: replace parent == dentry->d_parent by
 IS_ROOT()

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/dcache.c | 21 ++++++++++++---------
 fs/namei.c  |  4 ++--
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 1710d2484fd..c6fd1f27da5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -174,9 +174,12 @@ static struct dentry *d_kill(struct dentry *dentry)
 	dentry_stat.nr_dentry--;	/* For d_free, below */
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
-	parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		parent = NULL;
+	else
+		parent = dentry->d_parent;
 	d_free(dentry);
-	return dentry == parent ? NULL : parent;
+	return parent;
 }
 
 /* 
@@ -666,11 +669,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				BUG();
 			}
 
-			parent = dentry->d_parent;
-			if (parent == dentry)
+			if (IS_ROOT(dentry))
 				parent = NULL;
-			else
+			else {
+				parent = dentry->d_parent;
 				atomic_dec(&parent->d_count);
+			}
 
 			list_del(&dentry->d_u.d_child);
 			detached++;
@@ -1723,7 +1727,7 @@ static int d_isparent(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
-	for (p = p2; p->d_parent != p; p = p->d_parent) {
+	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p1)
 			return 1;
 	}
@@ -2168,10 +2172,9 @@ int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
 		seq = read_seqbegin(&rename_lock);
 		for (;;) {
 			if (new_dentry != old_dentry) {
-				struct dentry * parent = new_dentry->d_parent;
-				if (parent == new_dentry)
+				if (IS_ROOT(new_dentry))
 					break;
-				new_dentry = parent;
+				new_dentry = new_dentry->d_parent;
 				continue;
 			}
 			result = 1;
diff --git a/fs/namei.c b/fs/namei.c
index 2b8f823eda4..068a9e50c8c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1454,7 +1454,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 
-	for (p = p1; p->d_parent != p; p = p->d_parent) {
+	for (p = p1; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p2) {
 			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
 			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -1462,7 +1462,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 		}
 	}
 
-	for (p = p2; p->d_parent != p; p = p->d_parent) {
+	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p1) {
 			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
 			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
-- 
cgit v1.2.3


From e2761a1167633ed943fea29002f990194923d060 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:28 +0900
Subject: [PATCH vfs-2.6 2/6] vfs: add d_ancestor()

This adds d_ancestor() instead of d_isparent(), then use it.

If new_dentry == old_dentry, is_subdir() returns 1, looks strange.
"new_dentry == old_dentry" is not subdir obviously. But I'm not
checking callers for now, so this keeps current behavior.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/dcache.c | 45 +++++++++++++++++++++++----------------------
 fs/namei.c  | 22 ++++++++++------------
 2 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c6fd1f27da5..64024005da4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1720,18 +1720,23 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	spin_unlock(&dcache_lock);
 }
 
-/*
- * Helper that returns 1 if p1 is a parent of p2, else 0
+/**
+ * d_ancestor - search for an ancestor
+ * @p1: ancestor dentry
+ * @p2: child dentry
+ *
+ * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
+ * an ancestor of p2, else NULL.
  */
-static int d_isparent(struct dentry *p1, struct dentry *p2)
+struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
 	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p1)
-			return 1;
+			return p;
 	}
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1755,7 +1760,7 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
 
 	/* Check for loops */
 	ret = ERR_PTR(-ELOOP);
-	if (d_isparent(alias, dentry))
+	if (d_ancestor(alias, dentry))
 		goto out_err;
 
 	/* See lock_rename() */
@@ -2155,31 +2160,27 @@ out:
  * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
  */
   
-int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
+int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
 	int result;
-	struct dentry * saved = new_dentry;
 	unsigned long seq;
 
-	/* need rcu_readlock to protect against the d_parent trashing due to
-	 * d_move
+	/* FIXME: This is old behavior, needed? Please check callers. */
+	if (new_dentry == old_dentry)
+		return 1;
+
+	/*
+	 * Need rcu_readlock to protect against the d_parent trashing
+	 * due to d_move
 	 */
 	rcu_read_lock();
-        do {
+	do {
 		/* for restarting inner loop in case of seq retry */
-		new_dentry = saved;
-		result = 0;
 		seq = read_seqbegin(&rename_lock);
-		for (;;) {
-			if (new_dentry != old_dentry) {
-				if (IS_ROOT(new_dentry))
-					break;
-				new_dentry = new_dentry->d_parent;
-				continue;
-			}
+		if (d_ancestor(old_dentry, new_dentry))
 			result = 1;
-			break;
-		}
+		else
+			result = 0;
 	} while (read_seqretry(&rename_lock, seq));
 	rcu_read_unlock();
 
diff --git a/fs/namei.c b/fs/namei.c
index 068a9e50c8c..b7cd65224d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1454,20 +1454,18 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 
-	for (p = p1; !IS_ROOT(p); p = p->d_parent) {
-		if (p->d_parent == p2) {
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p2, p1);
+	if (p) {
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
-	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
-		if (p->d_parent == p1) {
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p1, p2);
+	if (p) {
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-- 
cgit v1.2.3


From 360da90029196c9449bc61e5a07ce8404e4cba57 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:28 +0900
Subject: [PATCH vfs-2.6 3/6] vfs: add __d_instantiate() helper

This adds __d_instantiate() for users which is already taking
dcache_lock, and replace with it.

The part of d_add_ci() isn't equivalent. But it should be needed
fsnotify_d_instantiate() actually, because the path is to add the
inode to negative dentry.  fsnotify_d_instantiate() should be called
after change from negative to positive.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/dcache.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 64024005da4..5a24cee6b76 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -981,6 +981,15 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 	return d_alloc(parent, &q);
 }
 
+/* the caller must hold dcache_lock */
+static void __d_instantiate(struct dentry *dentry, struct inode *inode)
+{
+	if (inode)
+		list_add(&dentry->d_alias, &inode->i_dentry);
+	dentry->d_inode = inode;
+	fsnotify_d_instantiate(dentry, inode);
+}
+
 /**
  * d_instantiate - fill in inode information for a dentry
  * @entry: dentry to complete
@@ -1000,10 +1009,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 {
 	BUG_ON(!list_empty(&entry->d_alias));
 	spin_lock(&dcache_lock);
-	if (inode)
-		list_add(&entry->d_alias, &inode->i_dentry);
-	entry->d_inode = inode;
-	fsnotify_d_instantiate(entry, inode);
+	__d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
@@ -1033,7 +1039,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 	unsigned int hash = entry->d_name.hash;
 
 	if (!inode) {
-		entry->d_inode = NULL;
+		__d_instantiate(entry, NULL);
 		return NULL;
 	}
 
@@ -1052,9 +1058,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 		return alias;
 	}
 
-	list_add(&entry->d_alias, &inode->i_dentry);
-	entry->d_inode = inode;
-	fsnotify_d_instantiate(entry, inode);
+	__d_instantiate(entry, inode);
 	return NULL;
 }
 
@@ -1213,10 +1217,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-			/* d_instantiate takes dcache_lock, so we do it by hand */
-			list_add(&dentry->d_alias, &inode->i_dentry);
-			dentry->d_inode = inode;
-			fsnotify_d_instantiate(dentry, inode);
+			/* already taking dcache_lock, so d_add() by hand */
+			__d_instantiate(dentry, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
@@ -1299,8 +1301,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 		 * d_instantiate() by hand because it takes dcache_lock which
 		 * we already hold.
 		 */
-		list_add(&found->d_alias, &inode->i_dentry);
-		found->d_inode = inode;
+		__d_instantiate(found, inode);
 		spin_unlock(&dcache_lock);
 		security_d_instantiate(found, inode);
 		return found;
@@ -1833,7 +1834,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 
 	if (!inode) {
 		actual = dentry;
-		dentry->d_inode = NULL;
+		__d_instantiate(dentry, NULL);
 		goto found_lock;
 	}
 
-- 
cgit v1.2.3


From 8f3dfaa5bab767a043c5af5b879fb86c03329f8a Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:29 +0900
Subject: [PATCH vfs-2.6 4/6] vfs: remove unnecessary fsnotify_d_instantiate()

This calls d_move(), so fsnotify_d_instantiate() is unnecessary like
rename path.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/dcache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5a24cee6b76..900de90d21b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1210,7 +1210,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-			fsnotify_d_instantiate(new, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_rehash(dentry);
-- 
cgit v1.2.3


From 0612d9fb270a474fe6a46cc5b8d3f5b71cf5f580 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:29 +0900
Subject: [PATCH vfs-2.6 5/6] vfs: remove LOOKUP_PARENT from non LOOKUP_PARENT
 lookup

lookup_hash() with LOOKUP_PARENT is bogus. And this prepares to add
new intent on those path.

The user of LOOKUP_PARENT intent is nfs only, and it checks whether
nd->flags has LOOKUP_CREATE or LOOKUP_OPEN, so the result is same.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/namei.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b7cd65224d6..18894fdf048 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2170,16 +2170,19 @@ static long do_rmdir(int dfd, const char __user *pathname)
 		return error;
 
 	switch(nd.last_type) {
-		case LAST_DOTDOT:
-			error = -ENOTEMPTY;
-			goto exit1;
-		case LAST_DOT:
-			error = -EINVAL;
-			goto exit1;
-		case LAST_ROOT:
-			error = -EBUSY;
-			goto exit1;
+	case LAST_DOTDOT:
+		error = -ENOTEMPTY;
+		goto exit1;
+	case LAST_DOT:
+		error = -EINVAL;
+		goto exit1;
+	case LAST_ROOT:
+		error = -EBUSY;
+		goto exit1;
 	}
+
+	nd.flags &= ~LOOKUP_PARENT;
+
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
@@ -2257,6 +2260,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM)
 		goto exit1;
+
+	nd.flags &= ~LOOKUP_PARENT;
+
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
@@ -2646,6 +2652,9 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 	if (newnd.last_type != LAST_NORM)
 		goto exit2;
 
+	oldnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags &= ~LOOKUP_PARENT;
+
 	trap = lock_rename(new_dir, old_dir);
 
 	old_dentry = lookup_hash(&oldnd);
-- 
cgit v1.2.3


From 4e9ed2f85af7adfa7c3f0efa839a53186254fdcb Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 16 Oct 2008 07:50:29 +0900
Subject: [PATCH vfs-2.6 6/6] vfs: add LOOKUP_RENAME_TARGET intent

This adds LOOKUP_RENAME_TARGET intent for lookup of rename destination.

LOOKUP_RENAME_TARGET is going to be used like LOOKUP_CREATE. But since
the destination of rename() can be existing directory entry, so it has a
difference. Although that difference doesn't matter in my usage, this
tells it to user of this intent.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/namei.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 18894fdf048..9e2a534383d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2654,6 +2654,7 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 
 	oldnd.flags &= ~LOOKUP_PARENT;
 	newnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags |= LOOKUP_RENAME_TARGET;
 
 	trap = lock_rename(new_dir, old_dir);
 
-- 
cgit v1.2.3


From 3222a3e55f4025acb2a5a4379cf2f2b7df1f1243 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 3 Sep 2008 21:53:01 +0200
Subject: [PATCH] fix ->llseek for more directories

With this patch all directory fops instances that have a readdir
that doesn't take the BKL are switched to generic_file_llseek.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/afs/dir.c          | 1 +
 fs/bfs/dir.c          | 1 +
 fs/cifs/cifsfs.c      | 1 +
 fs/fat/dir.c          | 1 +
 fs/jffs2/dir.c        | 3 ++-
 fs/jfs/namei.c        | 1 +
 fs/omfs/dir.c         | 1 +
 fs/openpromfs/inode.c | 1 +
 fs/proc/proc_sysctl.c | 1 +
 fs/sysfs/dir.c        | 1 +
 fs/ufs/dir.c          | 1 +
 11 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index dfda03d4397..99cf390641f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -45,6 +45,7 @@ const struct file_operations afs_dir_file_operations = {
 	.release	= afs_release,
 	.readdir	= afs_readdir,
 	.lock		= afs_lock,
+	.llseek		= generic_file_llseek,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ed8feb052df..daae463068e 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -80,6 +80,7 @@ const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
 
 extern void dump_imap(const char *, struct super_block *);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 25ecbd5b040..89c64a8dcb9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -765,6 +765,7 @@ const struct file_operations cifs_dir_ops = {
 	.dir_notify = cifs_dir_notify,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
+	.llseek = generic_file_llseek,
 };
 
 static void
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index cd4a0162e10..bae1c329252 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -839,6 +839,7 @@ const struct file_operations fat_dir_operations = {
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
 
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 621bdfa994e..6f60cc910f4 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -39,7 +39,8 @@ const struct file_operations jffs2_dir_operations =
 	.read =		generic_read_dir,
 	.readdir =	jffs2_readdir,
 	.unlocked_ioctl=jffs2_ioctl,
-	.fsync =	jffs2_fsync
+	.fsync =	jffs2_fsync,
+	.llseek =	generic_file_llseek,
 };
 
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e199dde7b83..cc3cedffbfa 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1547,6 +1547,7 @@ const struct file_operations jfs_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= jfs_compat_ioctl,
 #endif
+	.llseek		= generic_file_llseek,
 };
 
 static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c0757e99887..c7275cfbdcf 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -501,4 +501,5 @@ struct inode_operations omfs_dir_inops = {
 struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = omfs_readdir,
+	.llseek = generic_file_llseek,
 };
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 9f5b054f06b..d41bdc784de 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -167,6 +167,7 @@ static int openpromfs_readdir(struct file *, void *, filldir_t);
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
 	.readdir	= openpromfs_readdir,
+	.llseek		= generic_file_llseek,
 };
 
 static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 945a81043ba..5fe210c0917 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -353,6 +353,7 @@ static const struct file_operations proc_sys_file_operations = {
 
 static const struct file_operations proc_sys_dir_file_operations = {
 	.readdir	= proc_sys_readdir,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct inode_operations proc_sys_inode_operations = {
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 3a05a596e3b..82d3b79d0e0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -983,4 +983,5 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
+	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index df0bef18742..dbbbc466876 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -667,4 +667,5 @@ const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
-- 
cgit v1.2.3


From 91efc167d02509ea78abeff6d668065964c67c0b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Sep 2008 19:42:50 +0200
Subject: [PATCH] reiserfs: add missing llseek method

Reiserfs currently doesn't set a llseek method for regular files, which
means it will fall back to default_llseek.  This means no one can seek
beyond 2 Gigabytes on reiserfs, and that there's not protection vs
the i_size updates from writers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/reiserfs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index a804903d31d..33408417038 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -296,6 +296,7 @@ const struct file_operations reiserfs_file_operations = {
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
 	.splice_write = generic_file_splice_write,
+	.llseek = generic_file_llseek,
 };
 
 const struct inode_operations reiserfs_file_inode_operations = {
-- 
cgit v1.2.3


From 0e55a7cca4b66f625d67b292f80b6a976e77c51b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 26 Sep 2008 19:01:20 -0700
Subject: [RFC PATCH] touch_mnt_namespace when the mount flags change

Daemons that need to be launched while the rootfs is read-only can now
poll /proc/mounts to be notified when their O_RDWR requests may no
longer end in EROFS.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/namespace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index f527a0d6c64..cce46702d33 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1550,8 +1550,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
-	if (!err)
+	if (!err) {
 		security_sb_post_remount(path->mnt, flags, data);
+
+		spin_lock(&vfsmount_lock);
+		touch_mnt_namespace(path->mnt->mnt_ns);
+		spin_unlock(&vfsmount_lock);
+	}
 	return err;
 }
 
-- 
cgit v1.2.3


From 5cec56deb6d41b5b570306b17cd0b1590ebd0897 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Mon, 13 Oct 2008 18:32:42 +0800
Subject: [PATCH] fs/dcache.c: update comment of d_validate()

Parameters @hash and @len have been removed since 2.4.3,
now just to delete them.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
---
 fs/dcache.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 900de90d21b..12eac838558 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1462,8 +1462,6 @@ out:
  * d_validate - verify dentry provided from insecure source
  * @dentry: The dentry alleged to be valid child of @dparent
  * @dparent: The parent dentry (known to be valid)
- * @hash: Hash of the dentry
- * @len: Length of the name
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
-- 
cgit v1.2.3


From f696a3659fc4b3a3bf4bc83d9dbec5e5a2ffd929 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 31 Jul 2008 13:41:58 +0200
Subject: [PATCH] move executable checking into ->permission()

For execute permission on a regular files we need to check if file has
any execute bits at all, regardless of capabilites.

This check is normally performed by generic_permission() but was also
added to the case when the filesystem defines its own ->permission()
method.  In the latter case the filesystem should be responsible for
performing this check.

Move the check from inode_permission() inside filesystems which are
not calling generic_permission().

Create a helper function execute_ok() that returns true if the inode
is a directory or if any execute bits are present in i_mode.

Also fix up the following code:

 - coda control file is never executable
 - sysctl files are never executable
 - hfs_permission seems broken on MAY_EXEC, remove
 - hfsplus_permission is eqivalent to generic_permission(), remove

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/cifs/cifsfs.c      |  9 ++++++---
 fs/coda/dir.c         |  3 +++
 fs/coda/pioctl.c      |  2 +-
 fs/hfs/inode.c        |  8 --------
 fs/hfsplus/inode.c    | 13 -------------
 fs/namei.c            | 21 ++++-----------------
 fs/nfs/dir.c          |  3 +++
 fs/proc/proc_sysctl.c | 10 ++++++++--
 8 files changed, 25 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 89c64a8dcb9..84cc011a16e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -275,9 +275,12 @@ static int cifs_permission(struct inode *inode, int mask)
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
-		return 0;
-	else /* file mode might have been restricted at mount time
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
+		if ((mask & MAY_EXEC) && !execute_ok(inode))
+			return -EACCES;
+		else
+			return 0;
+	} else /* file mode might have been restricted at mount time
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index c5916228243..75b1fa90b2c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -146,6 +146,9 @@ int coda_permission(struct inode *inode, int mask)
 	if (!mask)
 		return 0; 
 
+	if ((mask & MAY_EXEC) && !execute_ok(inode))
+		return -EACCES;
+
 	lock_kernel();
 
 	if (coda_cache_check(inode, mask))
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index c51365422aa..773f2ce9aa0 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -43,7 +43,7 @@ const struct file_operations coda_ioctl_operations = {
 /* the coda pioctl inode ops */
 static int coda_ioctl_permission(struct inode *inode, int mask)
 {
-        return 0;
+	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
 static int coda_pioctl(struct inode * inode, struct file * filp, 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 7e19835efa2..c69b7ac75bf 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -511,13 +511,6 @@ void hfs_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfs_permission(struct inode *inode, int mask)
-{
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
 static int hfs_file_open(struct inode *inode, struct file *file)
 {
 	if (HFS_IS_RSRC(inode))
@@ -616,7 +609,6 @@ static const struct inode_operations hfs_file_inode_operations = {
 	.lookup		= hfs_file_lookup,
 	.truncate	= hfs_file_truncate,
 	.setattr	= hfs_inode_setattr,
-	.permission	= hfs_permission,
 	.setxattr	= hfs_setxattr,
 	.getxattr	= hfs_getxattr,
 	.listxattr	= hfs_listxattr,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 963be644297..b207f0e6fc2 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -238,18 +238,6 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
 }
 
-static int hfsplus_permission(struct inode *inode, int mask)
-{
-	/* MAY_EXEC is also used for lookup, if no x bit is set allow lookup,
-	 * open_exec has the same test, so it's still not executable, if a x bit
-	 * is set fall back to standard permission check.
-	 */
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111))
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
@@ -281,7 +269,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.truncate	= hfsplus_file_truncate,
-	.permission	= hfsplus_permission,
 	.setxattr	= hfsplus_setxattr,
 	.getxattr	= hfsplus_getxattr,
 	.listxattr	= hfsplus_listxattr,
diff --git a/fs/namei.c b/fs/namei.c
index 9e2a534383d..09ce58e49e7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -212,8 +212,7 @@ int generic_permission(struct inode *inode, int mask,
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
 	 */
-	if (!(mask & MAY_EXEC) ||
-	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+	if (!(mask & MAY_EXEC) || execute_ok(inode))
 		if (capable(CAP_DAC_OVERRIDE))
 			return 0;
 
@@ -249,23 +248,11 @@ int inode_permission(struct inode *inode, int mask)
 	}
 
 	/* Ordinary permission routines do not understand MAY_APPEND. */
-	if (inode->i_op && inode->i_op->permission) {
+	if (inode->i_op && inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
-		if (!retval) {
-			/*
-			 * Exec permission on a regular file is denied if none
-			 * of the execute bits are set.
-			 *
-			 * This check should be done by the ->permission()
-			 * method.
-			 */
-			if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) &&
-			    !(inode->i_mode & S_IXUGO))
-				return -EACCES;
-		}
-	} else {
+	else
 		retval = generic_permission(inode, mask, NULL);
-	}
+
 	if (retval)
 		return retval;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c216c8786c5..3e64b98f3a9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1957,6 +1957,9 @@ force_lookup:
 	} else
 		res = PTR_ERR(cred);
 out:
+	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
+		res = -EACCES;
+
 	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5fe210c0917..7b997754a25 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -298,13 +298,19 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	 * sysctl entries that are not writeable,
 	 * are _NOT_ writeable, capabilities or not.
 	 */
-	struct ctl_table_header *head = grab_header(inode);
-	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table_header *head;
+	struct ctl_table *table;
 	int error;
 
+	/* Executable files are not allowed under /proc/sys/ */
+	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
+		return -EACCES;
+
+	head = grab_header(inode);
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
+	table = PROC_I(inode)->sysctl_entry;
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-- 
cgit v1.2.3


From 2c512397ca060f6dbcb3957174a91e29a3b769be Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 20 Aug 2008 16:56:22 -0700
Subject: [patch 1/3] FS_MBCACHE: don't needlessly make it built-in

Assume you have:
- one or more of ext2/3/4 statically built into your kernel
- none of these with extended attributes enabled and
- want to add onother one of ext2/3/4 modular and with
  extended attributes enabled

then you currently have to reboot to use it since this results in
CONFIG_FS_MBCACHE=y.

That's not a common issue, but I just ran into it and since there's no
reason to get a built-in mbcache in this case this patch fixes it.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/Kconfig | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e46297f020c..522469a7eca 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -22,9 +22,10 @@ source "fs/jbd2/Kconfig"
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
+	default y if EXT2_FS=y && EXT2_FS_XATTR
+	default y if EXT3_FS=y && EXT3_FS_XATTR
+	default y if EXT4_FS=y && EXT4_FS_XATTR
+	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
 config REISERFS_FS
 	tristate "Reiserfs support"
-- 
cgit v1.2.3


From a77b72da24ecfe262760874c55e3f6461f1dec0d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 30 Jul 2008 14:06:22 +0200
Subject: [patch] vfs: make security_inode_setattr() calling consistent

Call security_inode_setattr() consistetly before inode_change_ok().
It doesn't make sense to try to "optimize" the i_op->setattr == NULL
case, as most filesystem do define their own setattr function.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/attr.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/attr.c b/fs/attr.c
index 26c71ba1eed..7a83819f6ba 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -159,17 +159,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
 		return 0;
 
+	error = security_inode_setattr(dentry, attr);
+	if (error)
+		return error;
+
 	if (ia_valid & ATTR_SIZE)
 		down_write(&dentry->d_inode->i_alloc_sem);
 
 	if (inode->i_op && inode->i_op->setattr) {
-		error = security_inode_setattr(dentry, attr);
-		if (!error)
-			error = inode->i_op->setattr(dentry, attr);
+		error = inode->i_op->setattr(dentry, attr);
 	} else {
 		error = inode_change_ok(inode, attr);
-		if (!error)
-			error = security_inode_setattr(dentry, attr);
 		if (!error) {
 			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-- 
cgit v1.2.3


From fd217f4d70172c526478f2bc76859e909fdfa674 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Tue, 21 Oct 2008 06:47:33 -0700
Subject: [PATCH] fs: add a sanity check in d_free

Hi Al,

remember that debug session we did at KS? You suggested this patch back
then....

From 7751eaf30474b8cbfaea64795805a17eab05ac53 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 16 Sep 2008 16:51:17 -0700
Subject: [PATCH] fs: add a sanity check in d_free

we're seeing some corruption in the dentry->d_alias list that
appears like a free of an entry still on the list; this patch
adds a WARN_ON() to catch this scenario, as suggested by Al Viro

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/dcache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 12eac838558..a1d86c7f3e6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -69,6 +69,7 @@ struct dentry_stat_t dentry_stat = {
 
 static void __d_free(struct dentry *dentry)
 {
+	WARN_ON(!list_empty(&dentry->d_alias));
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
 	kmem_cache_free(dentry_cache, dentry); 
-- 
cgit v1.2.3


From 7c88db0cb589df980acfb2f73c3595a0653004ec Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Thu, 16 Oct 2008 15:27:09 +0400
Subject: proc: fix vma display mismatch between /proc/pid/{maps,smaps}

Commit 4752c369789250eafcd7813e11c8fb689235b0d2 aka
"maps4: simplify interdependence of maps and smaps" broke /proc/pid/smaps,
causing it to display some vmas twice and other vmas not at all.  For example:

    grep .- /proc/1/smaps >/tmp/smaps; diff /proc/1/maps /tmp/smaps

    1  25d24
    2  < 7fd7e23aa000-7fd7e23ac000 rw-p 7fd7e23aa000 00:00 0
    3  28a28
    4  > ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0  [vsyscall]

The bug has something to do with setting m->version before all the
seq_printf's have been performed.  show_map was doing this correctly,
but show_smap was doing this in the middle of its seq_printf sequence.
This patch arranges things so that the setting of m->version in show_smap
is also done at the end of its seq_printf sequence.

Testing: in addition to the above grep test, for each process I summed
up the 'Rss' fields of /proc/pid/smaps and compared that to the 'VmRSS'
field of /proc/pid/status.  All matched except for Xorg (which has a
/dev/mem mapping which Rss accounts for but VmRSS does not).  This result
gives us some confidence that neither /proc/pid/maps nor /proc/pid/smaps
are any longer skipping or double-counting vmas.

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/task_mmu.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4806830ea2a..b770c095e45 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -198,11 +198,8 @@ static int do_maps_open(struct inode *inode, struct file *file,
 	return ret;
 }
 
-static int show_map(struct seq_file *m, void *v)
+static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
-	struct vm_area_struct *vma = v;
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
 	int flags = vma->vm_flags;
@@ -254,6 +251,15 @@ static int show_map(struct seq_file *m, void *v)
 		}
 	}
 	seq_putc(m, '\n');
+}
+
+static int show_map(struct seq_file *m, void *v)
+{
+	struct vm_area_struct *vma = v;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
+
+	show_map_vma(m, vma);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -364,9 +370,10 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 static int show_smap(struct seq_file *m, void *v)
 {
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
-	int ret;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
 		.mm = vma->vm_mm,
@@ -378,9 +385,7 @@ static int show_smap(struct seq_file *m, void *v)
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
-	ret = show_map(m, v);
-	if (ret)
-		return ret;
+	show_map_vma(m, vma);
 
 	seq_printf(m,
 		   "Size:           %8lu kB\n"
@@ -402,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.referenced >> 10,
 		   mss.swap >> 10);
 
-	return ret;
+	if (m->count < m->size)  /* vma is copied successfully */
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
 }
 
 static const struct seq_operations proc_pid_smaps_op = {
-- 
cgit v1.2.3


From 5bcd7ff9e1690dbdbccb2a1cb3c2ea8b8381c435 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 17 Oct 2008 03:43:55 +0400
Subject: proc: proc_init_inodecache() can't fail

kmem_cache creation code will panic, don't return anything.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/inode.c    | 3 +--
 fs/proc/internal.h | 2 +-
 fs/proc/root.c     | 6 +++---
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c6b4fa7e3b4..2543fd00c65 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -106,14 +106,13 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-int __init proc_init_inodecache(void)
+void __init proc_init_inodecache(void)
 {
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD|SLAB_PANIC),
 					     init_once);
-	return 0;
 }
 
 static const struct super_operations proc_sops = {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3bfb7b8747b..d5bf4dec6b0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -66,7 +66,7 @@ extern const struct inode_operations proc_net_inode_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
-int proc_init_inodecache(void);
+void proc_init_inodecache(void);
 
 static inline struct pid *proc_pid(struct inode *inode)
 {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 95117538a4f..2a3abd25b30 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -104,9 +104,9 @@ static struct file_system_type proc_fs_type = {
 
 void __init proc_root_init(void)
 {
-	int err = proc_init_inodecache();
-	if (err)
-		return;
+	int err;
+
+	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-- 
cgit v1.2.3


From 1e0edd3f67d5b5c04ef6c0908aac8bd70dffc6f6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 17 Oct 2008 05:07:44 +0400
Subject: proc: spread __init

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_devtree.c | 3 ++-
 fs/proc/proc_sysctl.c  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index eca471bc851..d777789b7a8 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -4,6 +4,7 @@
  * Copyright 1997 Paul Mackerras
  */
 #include <linux/errno.h>
+#include <linux/init.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
@@ -214,7 +215,7 @@ void proc_device_tree_add_node(struct device_node *np,
 /*
  * Called on initialization to set up the /proc/device-tree subtree
  */
-void proc_device_tree_init(void)
+void __init proc_device_tree_init(void)
 {
 	struct device_node *root;
 	if ( !have_of )
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 945a81043ba..41b5063e28d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1,7 +1,7 @@
 /*
  * /proc/sys support
  */
-
+#include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
@@ -395,7 +395,7 @@ static struct dentry_operations proc_sys_dentry_operations = {
 	.d_compare	= proc_sys_compare,
 };
 
-int proc_sys_init(void)
+int __init proc_sys_init(void)
 {
 	struct proc_dir_entry *proc_sys_root;
 
-- 
cgit v1.2.3


From 6c2f91e077f1b60e7f83b7ee044f965f469cfdb3 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 13 Sep 2008 19:51:30 -0700
Subject: proc: use WARN() rather than printk+backtrace

Use WARN() rather than a printk() + backtrace();
this gives a more standard format message as well as complete
information (including line numbers etc) that will be collected
by kerneloops.org

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/generic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7821589a17d..60a359b3558 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -547,9 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 
 	for (tmp = dir->subdir; tmp; tmp = tmp->next)
 		if (strcmp(tmp->name, dp->name) == 0) {
-			printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+			WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
 				dir->name, dp->name);
-			dump_stack();
 			break;
 		}
 
-- 
cgit v1.2.3


From 5b3acc8de8b2bc459afae6e09ada45c7e5b11bbf Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 3 Oct 2008 02:21:47 +0400
Subject: proc: switch /proc/loadavg to seq_file

and move it to fs/proc/loadavg.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  2 +-
 fs/proc/loadavg.c   | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c | 26 --------------------------
 3 files changed, 52 insertions(+), 27 deletions(-)
 create mode 100644 fs/proc/loadavg.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ebaba021354..bf2ab4df4a8 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -9,7 +9,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
-
+proc-y	+= loadavg.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
new file mode 100644
index 00000000000..9bca39cf99e
--- /dev/null
+++ b/fs/proc/loadavg.c
@@ -0,0 +1,51 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/time.h>
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+	int a, b, c;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		a = avenrun[0] + (FIXED_1/200);
+		b = avenrun[1] + (FIXED_1/200);
+		c = avenrun[2] + (FIXED_1/200);
+	} while (read_seqretry(&xtime_lock, seq));
+
+	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+		LOAD_INT(a), LOAD_FRAC(a),
+		LOAD_INT(b), LOAD_FRAC(b),
+		LOAD_INT(c), LOAD_FRAC(c),
+		nr_running(), nr_threads,
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+static int loadavg_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, loadavg_proc_show, NULL);
+}
+
+static const struct file_operations loadavg_proc_fops = {
+	.open		= loadavg_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_loadavg_init(void)
+{
+	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	return 0;
+}
+module_init(proc_loadavg_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7ea52c79b2d..ff42206c8ae 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,8 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 /*
  * Warning: stuff below (imported functions) assumes that its output will fit
  * into one page. For some of those functions it may be wrong. Moreover, we
@@ -80,29 +78,6 @@ static int proc_calc_metrics(char *page, char **start, off_t off,
 	return len;
 }
 
-static int loadavg_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int a, b, c;
-	int len;
-	unsigned long seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
-		nr_running(), nr_threads,
-		task_active_pid_ns(current)->last_pid);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 static int uptime_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -861,7 +836,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"loadavg",     loadavg_read_proc},
 		{"uptime",	uptime_read_proc},
 		{"meminfo",	meminfo_read_proc},
 		{"version",	version_read_proc},
-- 
cgit v1.2.3


From 9617760287eec9091d26e6967bd3e4194de18f97 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 3 Oct 2008 02:38:18 +0400
Subject: proc: switch /proc/uptime to seq_file

and move it to fs/proc/uptime.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  1 +
 fs/proc/proc_misc.c | 21 ---------------------
 fs/proc/uptime.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 21 deletions(-)
 create mode 100644 fs/proc/uptime.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bf2ab4df4a8..27efa14963b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 proc-y	+= loadavg.o
+proc-y	+= uptime.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ff42206c8ae..484b6011bf0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -78,26 +78,6 @@ static int proc_calc_metrics(char *page, char **start, off_t off,
 	return len;
 }
 
-static int uptime_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct timespec uptime;
-	struct timespec idle;
-	int len;
-	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
-
-	do_posix_clock_monotonic_gettime(&uptime);
-	monotonic_to_bootbased(&uptime);
-	cputime_to_timespec(idletime, &idle);
-	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
-			(unsigned long) uptime.tv_sec,
-			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
-			(unsigned long) idle.tv_sec,
-			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 int __attribute__((weak)) arch_report_meminfo(char *page)
 {
 	return 0;
@@ -836,7 +816,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"uptime",	uptime_read_proc},
 		{"meminfo",	meminfo_read_proc},
 		{"version",	version_read_proc},
 #ifdef CONFIG_PROC_HARDWARE
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
new file mode 100644
index 00000000000..0c10a0b3f14
--- /dev/null
+++ b/fs/proc/uptime.c
@@ -0,0 +1,43 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+#include <asm/cputime.h>
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	cputime_to_timespec(idletime, &idle);
+	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime.tv_sec,
+			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
+			(unsigned long) idle.tv_sec,
+			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
+	return 0;
+}
+
+static int uptime_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uptime_proc_show, NULL);
+}
+
+static const struct file_operations uptime_proc_fops = {
+	.open		= uptime_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_uptime_init(void)
+{
+	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	return 0;
+}
+module_init(proc_uptime_init);
-- 
cgit v1.2.3


From e1759c215bee5abbcb6cb066590ab20905154ed5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Oct 2008 23:50:22 +0400
Subject: proc: switch /proc/meminfo to seq_file

and move it to fs/proc/meminfo.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |   1 +
 fs/proc/meminfo.c   | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c | 137 ------------------------------------------
 3 files changed, 169 insertions(+), 137 deletions(-)
 create mode 100644 fs/proc/meminfo.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 27efa14963b..70607a03839 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 proc-y	+= loadavg.o
+proc-y	+= meminfo.o
 proc-y	+= uptime.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 00000000000..b1675c4e66d
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,168 @@
+#include <linux/fs.h>
+#include <linux/hugetlb.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/quicklist.h>
+#include <linux/seq_file.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	struct sysinfo i;
+	unsigned long committed;
+	unsigned long allowed;
+	struct vmalloc_info vmi;
+	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
+
+/*
+ * display in kilobytes.
+ */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+	si_meminfo(&i);
+	si_swapinfo(&i);
+	committed = atomic_long_read(&vm_committed_space);
+	allowed = ((totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+
+	cached = global_page_state(NR_FILE_PAGES) -
+			total_swapcache_pages - i.bufferram;
+	if (cached < 0)
+		cached = 0;
+
+	get_vmalloc_info(&vmi);
+
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+	/*
+	 * Tagged format, for easy grepping and expansion.
+	 */
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+#endif
+#ifdef CONFIG_HIGHMEM
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
+#endif
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+		"Quicklists:     %8lu kB\n"
+#endif
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
+		K(i.totalram),
+		K(i.freeram),
+		K(i.bufferram),
+		K(cached),
+		K(total_swapcache_pages),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
+#endif
+#ifdef CONFIG_HIGHMEM
+		K(i.totalhigh),
+		K(i.freehigh),
+		K(i.totalram-i.totalhigh),
+		K(i.freeram-i.freehigh),
+#endif
+		K(i.totalswap),
+		K(i.freeswap),
+		K(global_page_state(NR_FILE_DIRTY)),
+		K(global_page_state(NR_WRITEBACK)),
+		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+		K(quicklist_total_size()),
+#endif
+		K(global_page_state(NR_UNSTABLE_NFS)),
+		K(global_page_state(NR_BOUNCE)),
+		K(global_page_state(NR_WRITEBACK_TEMP)),
+		K(allowed),
+		K(committed),
+		(unsigned long)VMALLOC_TOTAL >> 10,
+		vmi.used >> 10,
+		vmi.largest_chunk >> 10
+		);
+
+	hugetlb_report_meminfo(m);
+
+	arch_report_meminfo(m);
+
+	return 0;
+#undef K
+}
+
+static int meminfo_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, meminfo_proc_show, NULL);
+}
+
+static const struct file_operations meminfo_proc_fops = {
+	.open		= meminfo_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_meminfo_init(void)
+{
+	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	return 0;
+}
+module_init(proc_meminfo_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 484b6011bf0..1aba51b0a0c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -78,142 +78,6 @@ static int proc_calc_metrics(char *page, char **start, off_t off,
 	return len;
 }
 
-int __attribute__((weak)) arch_report_meminfo(char *page)
-{
-	return 0;
-}
-
-static int meminfo_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct sysinfo i;
-	int len;
-	unsigned long committed;
-	unsigned long allowed;
-	struct vmalloc_info vmi;
-	long cached;
-	unsigned long pages[NR_LRU_LISTS];
-	int lru;
-
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
-	committed = atomic_long_read(&vm_committed_space);
-	allowed = ((totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100) + total_swap_pages;
-
-	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
-	if (cached < 0)
-		cached = 0;
-
-	get_vmalloc_info(&vmi);
-
-	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
-
-	/*
-	 * Tagged format, for easy grepping and expansion.
-	 */
-	len = sprintf(page,
-		"MemTotal:       %8lu kB\n"
-		"MemFree:        %8lu kB\n"
-		"Buffers:        %8lu kB\n"
-		"Cached:         %8lu kB\n"
-		"SwapCached:     %8lu kB\n"
-		"Active:         %8lu kB\n"
-		"Inactive:       %8lu kB\n"
-		"Active(anon):   %8lu kB\n"
-		"Inactive(anon): %8lu kB\n"
-		"Active(file):   %8lu kB\n"
-		"Inactive(file): %8lu kB\n"
-#ifdef CONFIG_UNEVICTABLE_LRU
-		"Unevictable:    %8lu kB\n"
-		"Mlocked:        %8lu kB\n"
-#endif
-#ifdef CONFIG_HIGHMEM
-		"HighTotal:      %8lu kB\n"
-		"HighFree:       %8lu kB\n"
-		"LowTotal:       %8lu kB\n"
-		"LowFree:        %8lu kB\n"
-#endif
-		"SwapTotal:      %8lu kB\n"
-		"SwapFree:       %8lu kB\n"
-		"Dirty:          %8lu kB\n"
-		"Writeback:      %8lu kB\n"
-		"AnonPages:      %8lu kB\n"
-		"Mapped:         %8lu kB\n"
-		"Slab:           %8lu kB\n"
-		"SReclaimable:   %8lu kB\n"
-		"SUnreclaim:     %8lu kB\n"
-		"PageTables:     %8lu kB\n"
-#ifdef CONFIG_QUICKLIST
-		"Quicklists:     %8lu kB\n"
-#endif
-		"NFS_Unstable:   %8lu kB\n"
-		"Bounce:         %8lu kB\n"
-		"WritebackTmp:   %8lu kB\n"
-		"CommitLimit:    %8lu kB\n"
-		"Committed_AS:   %8lu kB\n"
-		"VmallocTotal:   %8lu kB\n"
-		"VmallocUsed:    %8lu kB\n"
-		"VmallocChunk:   %8lu kB\n",
-		K(i.totalram),
-		K(i.freeram),
-		K(i.bufferram),
-		K(cached),
-		K(total_swapcache_pages),
-		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
-		K(pages[LRU_ACTIVE_ANON]),
-		K(pages[LRU_INACTIVE_ANON]),
-		K(pages[LRU_ACTIVE_FILE]),
-		K(pages[LRU_INACTIVE_FILE]),
-#ifdef CONFIG_UNEVICTABLE_LRU
-		K(pages[LRU_UNEVICTABLE]),
-		K(global_page_state(NR_MLOCK)),
-#endif
-#ifdef CONFIG_HIGHMEM
-		K(i.totalhigh),
-		K(i.freehigh),
-		K(i.totalram-i.totalhigh),
-		K(i.freeram-i.freehigh),
-#endif
-		K(i.totalswap),
-		K(i.freeswap),
-		K(global_page_state(NR_FILE_DIRTY)),
-		K(global_page_state(NR_WRITEBACK)),
-		K(global_page_state(NR_ANON_PAGES)),
-		K(global_page_state(NR_FILE_MAPPED)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE) +
-				global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE)),
-		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_PAGETABLE)),
-#ifdef CONFIG_QUICKLIST
-		K(quicklist_total_size()),
-#endif
-		K(global_page_state(NR_UNSTABLE_NFS)),
-		K(global_page_state(NR_BOUNCE)),
-		K(global_page_state(NR_WRITEBACK_TEMP)),
-		K(allowed),
-		K(committed),
-		(unsigned long)VMALLOC_TOTAL >> 10,
-		vmi.used >> 10,
-		vmi.largest_chunk >> 10
-		);
-
-		len += hugetlb_report_meminfo(page + len);
-
-	len += arch_report_meminfo(page + len);
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-#undef K
-}
-
 static int fragmentation_open(struct inode *inode, struct file *file)
 {
 	(void)inode;
@@ -816,7 +680,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"meminfo",	meminfo_read_proc},
 		{"version",	version_read_proc},
 #ifdef CONFIG_PROC_HARDWARE
 		{"hardware",	hardware_read_proc},
-- 
cgit v1.2.3


From b457d151613873ea035de0c7348abc3d4b6efd34 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 3 Oct 2008 11:53:19 +0400
Subject: proc: switch /proc/version to seq_file

and move it to fs/proc/version.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  1 +
 fs/proc/proc_misc.c | 13 -------------
 fs/proc/version.c   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 13 deletions(-)
 create mode 100644 fs/proc/version.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 70607a03839..97985c848ac 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -12,6 +12,7 @@ proc-y       += inode.o root.o base.o generic.o array.o \
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
 proc-y	+= uptime.o
+proc-y	+= version.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1aba51b0a0c..4814b111d83 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -115,18 +115,6 @@ static const struct file_operations proc_zoneinfo_file_operations = {
 	.release	= seq_release,
 };
 
-static int version_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len;
-
-	len = snprintf(page, PAGE_SIZE, linux_proc_banner,
-		utsname()->sysname,
-		utsname()->release,
-		utsname()->version);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 extern const struct seq_operations cpuinfo_op;
 static int cpuinfo_open(struct inode *inode, struct file *file)
 {
@@ -680,7 +668,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"version",	version_read_proc},
 #ifdef CONFIG_PROC_HARDWARE
 		{"hardware",	hardware_read_proc},
 #endif
diff --git a/fs/proc/version.c b/fs/proc/version.c
new file mode 100644
index 00000000000..76817a60678
--- /dev/null
+++ b/fs/proc/version.c
@@ -0,0 +1,34 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/utsname.h>
+
+static int version_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, linux_proc_banner,
+		utsname()->sysname,
+		utsname()->release,
+		utsname()->version);
+	return 0;
+}
+
+static int version_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, version_proc_show, NULL);
+}
+
+static const struct file_operations version_proc_fops = {
+	.open		= version_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_version_init(void)
+{
+	proc_create("version", 0, NULL, &version_proc_fops);
+	return 0;
+}
+module_init(proc_version_init);
-- 
cgit v1.2.3


From 813dcf7a6e642feb1ea566b96ce2912249d2b57d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 3 Oct 2008 22:42:36 +0400
Subject: proc: move /proc/hardware to m68k-specific code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 4814b111d83..542527f6630 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -63,7 +63,6 @@
  * have a way to deal with that gracefully. Right now I used straightforward
  * wrappers, but this needs further analysis wrt potential overflows.
  */
-extern int get_hardware_list(char *);
 extern int get_stram_list(char *);
 extern int get_exec_domain_list(char *);
 
@@ -198,15 +197,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 	.release	= seq_release,
 };
 
-#ifdef CONFIG_PROC_HARDWARE
-static int hardware_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_hardware_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-#endif
-
 #ifdef CONFIG_STRAM_PROC
 static int stram_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -668,9 +658,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-#ifdef CONFIG_PROC_HARDWARE
-		{"hardware",	hardware_read_proc},
-#endif
 #ifdef CONFIG_STRAM_PROC
 		{"stram",	stram_read_proc},
 #endif
-- 
cgit v1.2.3


From 4c150f6c30f5129bbce5c41568a285b1f7ca8d8b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 13:49:34 +0400
Subject: proc: move /proc/stram to m68k-specific code

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 542527f6630..ec37d3aeaf1 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -63,7 +63,6 @@
  * have a way to deal with that gracefully. Right now I used straightforward
  * wrappers, but this needs further analysis wrt potential overflows.
  */
-extern int get_stram_list(char *);
 extern int get_exec_domain_list(char *);
 
 static int proc_calc_metrics(char *page, char **start, off_t off,
@@ -197,15 +196,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 	.release	= seq_release,
 };
 
-#ifdef CONFIG_STRAM_PROC
-static int stram_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_stram_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-#endif
-
 #ifdef CONFIG_BLOCK
 static int partitions_open(struct inode *inode, struct file *file)
 {
@@ -658,9 +648,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-#ifdef CONFIG_STRAM_PROC
-		{"stram",	stram_read_proc},
-#endif
 		{"filesystems",	filesystems_read_proc},
 		{"cmdline",	cmdline_read_proc},
 		{"execdomains",	execdomains_read_proc},
-- 
cgit v1.2.3


From 6827400713fa22312ca3b4f47b0e64871c88040c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 14:08:37 +0400
Subject: proc: move /proc/filesystems to fs/filesystems.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/filesystems.c    | 39 +++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c |  8 --------
 2 files changed, 39 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/filesystems.c b/fs/filesystems.c
index f37f8726283..d0e20ced62d 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -8,6 +8,8 @@
 
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
@@ -214,6 +216,43 @@ int get_filesystem_list(char * buf)
 	return len;
 }
 
+#ifdef CONFIG_PROC_FS
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	struct file_system_type * tmp;
+
+	read_lock(&file_systems_lock);
+	tmp = file_systems;
+	while (tmp) {
+		seq_printf(m, "%s\t%s\n",
+			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			tmp->name);
+		tmp = tmp->next;
+	}
+	read_unlock(&file_systems_lock);
+	return 0;
+}
+
+static int filesystems_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, filesystems_proc_show, NULL);
+}
+
+static const struct file_operations filesystems_proc_fops = {
+	.open		= filesystems_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_filesystems_init(void)
+{
+	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	return 0;
+}
+module_init(proc_filesystems_init);
+#endif
+
 struct file_system_type *get_fs_type(const char *name)
 {
 	struct file_system_type *fs;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ec37d3aeaf1..9fa20f157cf 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -472,13 +472,6 @@ static const struct file_operations proc_interrupts_operations = {
 	.release	= seq_release,
 };
 
-static int filesystems_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_filesystem_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 static int cmdline_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -648,7 +641,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"filesystems",	filesystems_read_proc},
 		{"cmdline",	cmdline_read_proc},
 		{"execdomains",	execdomains_read_proc},
 		{NULL,}
-- 
cgit v1.2.3


From cf9887f102541b8a0adb73f7da9c28d090622010 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 14:13:59 +0400
Subject: proc: switch /proc/cmdline to seq_file

and move it to fs/proc/cmdline.c while I'm at it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  1 +
 fs/proc/cmdline.c   | 29 +++++++++++++++++++++++++++++
 fs/proc/proc_misc.c | 10 ----------
 3 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 fs/proc/cmdline.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 97985c848ac..48f9f0f121b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -9,6 +9,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
+proc-y	+= cmdline.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
 proc-y	+= uptime.o
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
new file mode 100644
index 00000000000..82676e3fcd1
--- /dev/null
+++ b/fs/proc/cmdline.c
@@ -0,0 +1,29 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int cmdline_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%s\n", saved_command_line);
+	return 0;
+}
+
+static int cmdline_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cmdline_proc_show, NULL);
+}
+
+static const struct file_operations cmdline_proc_fops = {
+	.open		= cmdline_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_cmdline_init(void)
+{
+	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	return 0;
+}
+module_init(proc_cmdline_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9fa20f157cf..15257d4b1b9 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -472,15 +472,6 @@ static const struct file_operations proc_interrupts_operations = {
 	.release	= seq_release,
 };
 
-static int cmdline_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%s\n", saved_command_line);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 #ifdef CONFIG_FILE_LOCKING
 static int locks_open(struct inode *inode, struct file *filp)
 {
@@ -641,7 +632,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"cmdline",	cmdline_read_proc},
 		{"execdomains",	execdomains_read_proc},
 		{NULL,}
 	};
-- 
cgit v1.2.3


From 6e62775ece1c83a84d86df44cfd8ea3e6c96ce23 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 14:28:09 +0400
Subject: proc: move /proc/execdomains to kernel/exec_domain.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 15257d4b1b9..424b9d03b1a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -63,7 +63,6 @@
  * have a way to deal with that gracefully. Right now I used straightforward
  * wrappers, but this needs further analysis wrt potential overflows.
  */
-extern int get_exec_domain_list(char *);
 
 static int proc_calc_metrics(char *page, char **start, off_t off,
 				 int count, int *eof, int len)
@@ -486,13 +485,6 @@ static const struct file_operations proc_locks_operations = {
 };
 #endif /* CONFIG_FILE_LOCKING */
 
-static int execdomains_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_exec_domain_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -632,7 +624,6 @@ void __init proc_misc_init(void)
 		char *name;
 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
 	} *p, simple_ones[] = {
-		{"execdomains",	execdomains_read_proc},
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
-- 
cgit v1.2.3


From 659689280ad91d31235db79cda6c7c799c4d3781 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 14:30:53 +0400
Subject: proc: remove remnants of ->read_proc in proc_misc.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 424b9d03b1a..e177f42496c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,24 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-/*
- * Warning: stuff below (imported functions) assumes that its output will fit
- * into one page. For some of those functions it may be wrong. Moreover, we
- * have a way to deal with that gracefully. Right now I used straightforward
- * wrappers, but this needs further analysis wrt potential overflows.
- */
-
-static int proc_calc_metrics(char *page, char **start, off_t off,
-				 int count, int *eof, int len)
-{
-	if (len <= off+count) *eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len>count) len = count;
-	if (len<0) len = 0;
-	return len;
-}
-
 static int fragmentation_open(struct inode *inode, struct file *file)
 {
 	(void)inode;
@@ -620,15 +602,6 @@ struct proc_dir_entry *proc_root_kcore;
 
 void __init proc_misc_init(void)
 {
-	static struct {
-		char *name;
-		int (*read_proc)(char*,char**,off_t,int,int*,void*);
-	} *p, simple_ones[] = {
-		{NULL,}
-	};
-	for (p = simple_ones; p->name; p++)
-		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);
-
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-- 
cgit v1.2.3


From ae048112c099b0f3adb57f7c0b3a49bc62244609 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 14:39:12 +0400
Subject: proc: move /proc/kmsg creation to fs/proc/kmsg.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/internal.h  |  1 -
 fs/proc/kmsg.c      | 12 +++++++++---
 fs/proc/proc_misc.c |  3 ---
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d5bf4dec6b0..3e8aeb8b61c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,7 +61,6 @@ extern const struct file_operations proc_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
-extern const struct file_operations proc_kmsg_operations;
 extern const struct inode_operations proc_net_inode_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 9fd5df3f40c..7ca78346d3f 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -10,13 +10,12 @@
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/poll.h>
+#include <linux/proc_fs.h>
 #include <linux/fs.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
-#include "internal.h"
-
 extern wait_queue_head_t log_wait;
 
 extern int do_syslog(int type, char __user *bug, int count);
@@ -49,9 +48,16 @@ static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 }
 
 
-const struct file_operations proc_kmsg_operations = {
+static const struct file_operations proc_kmsg_operations = {
 	.read		= kmsg_read,
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
 };
+
+static int __init proc_kmsg_init(void)
+{
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	return 0;
+}
+module_init(proc_kmsg_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e177f42496c..fcac25edaef 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -605,9 +605,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PRINTK
-	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
-#endif
 #ifdef CONFIG_FILE_LOCKING
 	proc_create("locks", 0, NULL, &proc_locks_operations);
 #endif
-- 
cgit v1.2.3


From d8ba7a363393f803c93c8cffabd6d0362618bc2a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 22:34:18 +0400
Subject: proc: move rest of /proc/locks to fs/locks.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/locks.c          | 22 +++++++++++++++++++++-
 fs/proc/proc_misc.c | 17 -----------------
 2 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 5eb259e3cd3..90e87f57b33 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2078,6 +2078,7 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 
 #ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
@@ -2183,12 +2184,31 @@ static void locks_stop(struct seq_file *f, void *v)
 	unlock_kernel();
 }
 
-struct seq_operations locks_seq_operations = {
+static const struct seq_operations locks_seq_operations = {
 	.start	= locks_start,
 	.next	= locks_next,
 	.stop	= locks_stop,
 	.show	= locks_show,
 };
+
+static int locks_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &locks_seq_operations);
+}
+
+static const struct file_operations proc_locks_operations = {
+	.open		= locks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_locks_init(void)
+{
+	proc_create("locks", 0, NULL, &proc_locks_operations);
+	return 0;
+}
+module_init(proc_locks_init);
 #endif
 
 /**
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fcac25edaef..fea7d658fff 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -453,20 +453,6 @@ static const struct file_operations proc_interrupts_operations = {
 	.release	= seq_release,
 };
 
-#ifdef CONFIG_FILE_LOCKING
-static int locks_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &locks_seq_operations);
-}
-
-static const struct file_operations proc_locks_operations = {
-	.open		= locks_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif /* CONFIG_FILE_LOCKING */
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -605,9 +591,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_FILE_LOCKING
-	proc_create("locks", 0, NULL, &proc_locks_operations);
-#endif
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
-- 
cgit v1.2.3


From fe2510426a02dd2fefcb0389b60f77cbe0462289 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 23:11:37 +0400
Subject: proc: move /proc/devices code to fs/proc/devices.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  1 +
 fs/proc/devices.c   | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c | 60 ---------------------------------------------
 3 files changed, 71 insertions(+), 60 deletions(-)
 create mode 100644 fs/proc/devices.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 48f9f0f121b..f24ebfdc5b4 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 proc-y	+= cmdline.o
+proc-y	+= devices.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
 proc-y	+= uptime.o
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
new file mode 100644
index 00000000000..59ee7da959c
--- /dev/null
+++ b/fs/proc/devices.c
@@ -0,0 +1,70 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int devinfo_show(struct seq_file *f, void *v)
+{
+	int i = *(loff_t *) v;
+
+	if (i < CHRDEV_MAJOR_HASH_SIZE) {
+		if (i == 0)
+			seq_printf(f, "Character devices:\n");
+		chrdev_show(f, i);
+	}
+#ifdef CONFIG_BLOCK
+	else {
+		i -= CHRDEV_MAJOR_HASH_SIZE;
+		if (i == 0)
+			seq_printf(f, "\nBlock devices:\n");
+		blkdev_show(f, i);
+	}
+#endif
+	return 0;
+}
+
+static void *devinfo_start(struct seq_file *f, loff_t *pos)
+{
+	if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return pos;
+	return NULL;
+}
+
+static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return NULL;
+	return pos;
+}
+
+static void devinfo_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations devinfo_ops = {
+	.start = devinfo_start,
+	.next  = devinfo_next,
+	.stop  = devinfo_stop,
+	.show  = devinfo_show
+};
+
+static int devinfo_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &devinfo_ops);
+}
+
+static const struct file_operations proc_devinfo_operations = {
+	.open		= devinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_devices_init(void)
+{
+	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	return 0;
+}
+module_init(proc_devices_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fea7d658fff..a6fadc0cc4b 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -107,65 +107,6 @@ static const struct file_operations proc_cpuinfo_operations = {
 	.release	= seq_release,
 };
 
-static int devinfo_show(struct seq_file *f, void *v)
-{
-	int i = *(loff_t *) v;
-
-	if (i < CHRDEV_MAJOR_HASH_SIZE) {
-		if (i == 0)
-			seq_printf(f, "Character devices:\n");
-		chrdev_show(f, i);
-	}
-#ifdef CONFIG_BLOCK
-	else {
-		i -= CHRDEV_MAJOR_HASH_SIZE;
-		if (i == 0)
-			seq_printf(f, "\nBlock devices:\n");
-		blkdev_show(f, i);
-	}
-#endif
-	return 0;
-}
-
-static void *devinfo_start(struct seq_file *f, loff_t *pos)
-{
-	if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
-		return pos;
-	return NULL;
-}
-
-static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
-{
-	(*pos)++;
-	if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
-		return NULL;
-	return pos;
-}
-
-static void devinfo_stop(struct seq_file *f, void *v)
-{
-	/* Nothing to do */
-}
-
-static const struct seq_operations devinfo_ops = {
-	.start = devinfo_start,
-	.next  = devinfo_next,
-	.stop  = devinfo_stop,
-	.show  = devinfo_show
-};
-
-static int devinfo_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &devinfo_ops);
-}
-
-static const struct file_operations proc_devinfo_operations = {
-	.open		= devinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int vmstat_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &vmstat_op);
@@ -591,7 +532,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("partitions", 0, NULL, &proc_partitions_operations);
-- 
cgit v1.2.3


From 8591cf43224980a0bc9216a4e50b0a740f8cba35 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 23:40:23 +0400
Subject: proc: move /proc/cpuinfo code to fs/proc/cpuinfo.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  1 +
 fs/proc/cpuinfo.c   | 24 ++++++++++++++++++++++++
 fs/proc/proc_misc.c | 14 --------------
 3 files changed, 25 insertions(+), 14 deletions(-)
 create mode 100644 fs/proc/cpuinfo.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index f24ebfdc5b4..c91cc9c283b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,6 +10,7 @@ proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 proc-y	+= cmdline.o
+proc-y	+= cpuinfo.o
 proc-y	+= devices.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
new file mode 100644
index 00000000000..5a1e539a234
--- /dev/null
+++ b/fs/proc/cpuinfo.c
@@ -0,0 +1,24 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern const struct seq_operations cpuinfo_op;
+static int cpuinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuinfo_op);
+}
+
+static const struct file_operations proc_cpuinfo_operations = {
+	.open		= cpuinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_cpuinfo_init(void)
+{
+	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	return 0;
+}
+module_init(proc_cpuinfo_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a6fadc0cc4b..8974809be5f 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -94,19 +94,6 @@ static const struct file_operations proc_zoneinfo_file_operations = {
 	.release	= seq_release,
 };
 
-extern const struct seq_operations cpuinfo_op;
-static int cpuinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &cpuinfo_op);
-}
-
-static const struct file_operations proc_cpuinfo_operations = {
-	.open		= cpuinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int vmstat_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &vmstat_op);
@@ -532,7 +519,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("partitions", 0, NULL, &proc_partitions_operations);
 #endif
-- 
cgit v1.2.3


From f500975a3f3ecf3611d79f1d933906753460b9f2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 4 Oct 2008 23:53:21 +0400
Subject: proc: move rest of /proc/partitions code to block/genhd.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/proc/proc_misc.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8974809be5f..253ea50c439 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -106,17 +106,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 };
 
 #ifdef CONFIG_BLOCK
-static int partitions_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &partitions_op);
-}
-static const struct file_operations proc_partitions_operations = {
-	.open		= partitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int diskstats_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &diskstats_op);
@@ -519,9 +508,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_BLOCK
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
-#endif
 	proc_create("stat", 0, NULL, &proc_stat_operations);
 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 #ifdef CONFIG_SLABINFO
-- 
cgit v1.2.3


From df8106dbb59a8c167ea16631059ecb5f7d77da13 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 5 Oct 2008 00:01:56 +0400
Subject: proc: move /proc/stat to fs/proc/stat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |   1 +
 fs/proc/proc_misc.c | 134 ---------------------------------------------
 fs/proc/stat.c      | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 134 deletions(-)
 create mode 100644 fs/proc/stat.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index c91cc9c283b..13ffa7b8d92 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -14,6 +14,7 @@ proc-y	+= cpuinfo.o
 proc-y	+= devices.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
+proc-y	+= stat.o
 proc-y	+= uptime.o
 proc-y	+= version.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 253ea50c439..a87802fc931 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -198,139 +198,6 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
 
-#ifndef arch_irq_stat_cpu
-#define arch_irq_stat_cpu(cpu) 0
-#endif
-#ifndef arch_irq_stat
-#define arch_irq_stat() 0
-#endif
-
-static int show_stat(struct seq_file *p, void *v)
-{
-	int i, j;
-	unsigned long jif;
-	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
-	u64 sum = 0;
-	struct timespec boottime;
-	unsigned int per_irq_sum;
-
-	user = nice = system = idle = iowait =
-		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
-	getboottime(&boottime);
-	jif = boottime.tv_sec;
-
-	for_each_possible_cpu(i) {
-		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
-		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
-		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
-		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
-		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
-		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
-		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
-		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
-			sum += kstat_irqs_cpu(j, i);
-
-		sum += arch_irq_stat_cpu(i);
-	}
-	sum += arch_irq_stat();
-
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
-		(unsigned long long)cputime64_to_clock_t(user),
-		(unsigned long long)cputime64_to_clock_t(nice),
-		(unsigned long long)cputime64_to_clock_t(system),
-		(unsigned long long)cputime64_to_clock_t(idle),
-		(unsigned long long)cputime64_to_clock_t(iowait),
-		(unsigned long long)cputime64_to_clock_t(irq),
-		(unsigned long long)cputime64_to_clock_t(softirq),
-		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
-	for_each_online_cpu(i) {
-
-		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kstat_cpu(i).cpustat.user;
-		nice = kstat_cpu(i).cpustat.nice;
-		system = kstat_cpu(i).cpustat.system;
-		idle = kstat_cpu(i).cpustat.idle;
-		iowait = kstat_cpu(i).cpustat.iowait;
-		irq = kstat_cpu(i).cpustat.irq;
-		softirq = kstat_cpu(i).cpustat.softirq;
-		steal = kstat_cpu(i).cpustat.steal;
-		guest = kstat_cpu(i).cpustat.guest;
-		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
-			i,
-			(unsigned long long)cputime64_to_clock_t(user),
-			(unsigned long long)cputime64_to_clock_t(nice),
-			(unsigned long long)cputime64_to_clock_t(system),
-			(unsigned long long)cputime64_to_clock_t(idle),
-			(unsigned long long)cputime64_to_clock_t(iowait),
-			(unsigned long long)cputime64_to_clock_t(irq),
-			(unsigned long long)cputime64_to_clock_t(softirq),
-			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
-	}
-	seq_printf(p, "intr %llu", (unsigned long long)sum);
-
-	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
-		per_irq_sum = 0;
-
-		for_each_possible_cpu(i)
-			per_irq_sum += kstat_irqs_cpu(j, i);
-
-		seq_printf(p, " %u", per_irq_sum);
-	}
-
-	seq_printf(p,
-		"\nctxt %llu\n"
-		"btime %lu\n"
-		"processes %lu\n"
-		"procs_running %lu\n"
-		"procs_blocked %lu\n",
-		nr_context_switches(),
-		(unsigned long)jif,
-		total_forks,
-		nr_running(),
-		nr_iowait());
-
-	return 0;
-}
-
-static int stat_open(struct inode *inode, struct file *file)
-{
-	unsigned size = 4096 * (1 + num_possible_cpus() / 32);
-	char *buf;
-	struct seq_file *m;
-	int res;
-
-	/* don't ask for more than the kmalloc() max size, currently 128 KB */
-	if (size > 128 * 1024)
-		size = 128 * 1024;
-	buf = kmalloc(size, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	res = single_open(file, show_stat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
-}
-static const struct file_operations proc_stat_operations = {
-	.open		= stat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 /*
  * /proc/interrupts
  */
@@ -508,7 +375,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("stat", 0, NULL, &proc_stat_operations);
 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 #ifdef CONFIG_SLABINFO
 	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
new file mode 100644
index 00000000000..81904f07679
--- /dev/null
+++ b/fs/proc/stat.c
@@ -0,0 +1,153 @@
+#include <linux/cpumask.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <asm/cputime.h>
+
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
+static int show_stat(struct seq_file *p, void *v)
+{
+	int i, j;
+	unsigned long jif;
+	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+	cputime64_t guest;
+	u64 sum = 0;
+	struct timespec boottime;
+	unsigned int per_irq_sum;
+
+	user = nice = system = idle = iowait =
+		irq = softirq = steal = cputime64_zero;
+	guest = cputime64_zero;
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	for_each_possible_cpu(i) {
+		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
+		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
+		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
+		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
+		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
+		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+
+		for_each_irq_nr(j)
+			sum += kstat_irqs_cpu(j, i);
+
+		sum += arch_irq_stat_cpu(i);
+	}
+	sum += arch_irq_stat();
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+		(unsigned long long)cputime64_to_clock_t(user),
+		(unsigned long long)cputime64_to_clock_t(nice),
+		(unsigned long long)cputime64_to_clock_t(system),
+		(unsigned long long)cputime64_to_clock_t(idle),
+		(unsigned long long)cputime64_to_clock_t(iowait),
+		(unsigned long long)cputime64_to_clock_t(irq),
+		(unsigned long long)cputime64_to_clock_t(softirq),
+		(unsigned long long)cputime64_to_clock_t(steal),
+		(unsigned long long)cputime64_to_clock_t(guest));
+	for_each_online_cpu(i) {
+
+		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+		user = kstat_cpu(i).cpustat.user;
+		nice = kstat_cpu(i).cpustat.nice;
+		system = kstat_cpu(i).cpustat.system;
+		idle = kstat_cpu(i).cpustat.idle;
+		iowait = kstat_cpu(i).cpustat.iowait;
+		irq = kstat_cpu(i).cpustat.irq;
+		softirq = kstat_cpu(i).cpustat.softirq;
+		steal = kstat_cpu(i).cpustat.steal;
+		guest = kstat_cpu(i).cpustat.guest;
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			i,
+			(unsigned long long)cputime64_to_clock_t(user),
+			(unsigned long long)cputime64_to_clock_t(nice),
+			(unsigned long long)cputime64_to_clock_t(system),
+			(unsigned long long)cputime64_to_clock_t(idle),
+			(unsigned long long)cputime64_to_clock_t(iowait),
+			(unsigned long long)cputime64_to_clock_t(irq),
+			(unsigned long long)cputime64_to_clock_t(softirq),
+			(unsigned long long)cputime64_to_clock_t(steal),
+			(unsigned long long)cputime64_to_clock_t(guest));
+	}
+	seq_printf(p, "intr %llu", (unsigned long long)sum);
+
+	/* sum again ? it could be updated? */
+	for_each_irq_nr(j) {
+		per_irq_sum = 0;
+
+		for_each_possible_cpu(i)
+			per_irq_sum += kstat_irqs_cpu(j, i);
+
+		seq_printf(p, " %u", per_irq_sum);
+	}
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		nr_context_switches(),
+		(unsigned long)jif,
+		total_forks,
+		nr_running(),
+		nr_iowait());
+
+	return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	unsigned size = 4096 * (1 + num_possible_cpus() / 32);
+	char *buf;
+	struct seq_file *m;
+	int res;
+
+	/* don't ask for more than the kmalloc() max size, currently 128 KB */
+	if (size > 128 * 1024)
+		size = 128 * 1024;
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	res = single_open(file, show_stat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+static const struct file_operations proc_stat_operations = {
+	.open		= stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_stat_init(void)
+{
+	proc_create("stat", 0, NULL, &proc_stat_operations);
+	return 0;
+}
+module_init(proc_stat_init);
-- 
cgit v1.2.3


From d6917e19f3fda8e1f88bc23ddceed952927bd716 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 5 Oct 2008 00:08:44 +0400
Subject: proc: move /proc/interrupts boilerplate code to fs/proc/interrupts.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile     |  1 +
 fs/proc/interrupts.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c  | 40 ---------------------------------------
 3 files changed, 54 insertions(+), 40 deletions(-)
 create mode 100644 fs/proc/interrupts.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 13ffa7b8d92..b921f4475c9 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -12,6 +12,7 @@ proc-y       += inode.o root.o base.o generic.o array.o \
 proc-y	+= cmdline.o
 proc-y	+= cpuinfo.o
 proc-y	+= devices.o
+proc-y	+= interrupts.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
 proc-y	+= stat.o
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
new file mode 100644
index 00000000000..05029c0e2f2
--- /dev/null
+++ b/fs/proc/interrupts.c
@@ -0,0 +1,53 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irqnr.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/interrupts
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	return (*pos <= nr_irqs) ? pos : NULL;
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos > nr_irqs)
+		return NULL;
+	return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations int_seq_ops = {
+	.start = int_seq_start,
+	.next  = int_seq_next,
+	.stop  = int_seq_stop,
+	.show  = show_interrupts
+};
+
+static int interrupts_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &int_seq_ops);
+}
+
+static const struct file_operations proc_interrupts_operations = {
+	.open		= interrupts_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_interrupts_init(void)
+{
+	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+	return 0;
+}
+module_init(proc_interrupts_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a87802fc931..9e1d2684ce9 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -198,45 +198,6 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
 
-/*
- * /proc/interrupts
- */
-static void *int_seq_start(struct seq_file *f, loff_t *pos)
-{
-	return (*pos <= nr_irqs) ? pos : NULL;
-}
-
-
-static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
-{
-	(*pos)++;
-	return (*pos <= nr_irqs) ? pos : NULL;
-}
-
-static void int_seq_stop(struct seq_file *f, void *v)
-{
-	/* Nothing to do */
-}
-
-static const struct seq_operations int_seq_ops = {
-	.start = int_seq_start,
-	.next  = int_seq_next,
-	.stop  = int_seq_stop,
-	.show  = show_interrupts
-};
-
-static int interrupts_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &int_seq_ops);
-}
-
-static const struct file_operations proc_interrupts_operations = {
-	.open		= interrupts_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -375,7 +336,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 #ifdef CONFIG_SLABINFO
 	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
 #ifdef CONFIG_DEBUG_SLAB_LEAK
-- 
cgit v1.2.3


From a0ec95a8e69792e4ad642daac037c9b01ea3e2cd Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 00:59:10 +0400
Subject: proc: move /proc/slab_allocators boilerplate to mm/slab.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 fs/proc/proc_misc.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9e1d2684ce9..5bca02842d0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -144,33 +144,6 @@ static const struct file_operations proc_slabinfo_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
-
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-extern const struct seq_operations slabstats_op;
-static int slabstats_open(struct inode *inode, struct file *file)
-{
-	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	int ret = -ENOMEM;
-	if (n) {
-		ret = seq_open(file, &slabstats_op);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
-			m->private = n;
-			n = NULL;
-		}
-		kfree(n);
-	}
-	return ret;
-}
-
-static const struct file_operations proc_slabstats_operations = {
-	.open		= slabstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-#endif
 #endif
 
 #ifdef CONFIG_MMU
@@ -338,9 +311,6 @@ void __init proc_misc_init(void)
 	/* And now for trickier ones */
 #ifdef CONFIG_SLABINFO
 	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
-#endif
 #endif
 #ifdef CONFIG_MMU
 	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
-- 
cgit v1.2.3


From 7b3c3a50a3e0ea46815150d420fa276ac254572b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 02:42:17 +0400
Subject: proc: move /proc/slabinfo boilerplate to mm/slub.c, mm/slab.c

Lose dummy ->write hook in case of SLUB, it's possible now.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 fs/proc/proc_misc.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bca02842d0..1d6d5c5cc2a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -132,20 +132,6 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
 
-#ifdef CONFIG_SLABINFO
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slabinfo_op);
-}
-static const struct file_operations proc_slabinfo_operations = {
-	.open		= slabinfo_open,
-	.read		= seq_read,
-	.write		= slabinfo_write,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 #ifdef CONFIG_MMU
 static int vmalloc_open(struct inode *inode, struct file *file)
 {
@@ -309,9 +295,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_SLABINFO
-	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
-#endif
 #ifdef CONFIG_MMU
 	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
 #endif
-- 
cgit v1.2.3


From 5f6a6a9c4e4d790aae55cb412a7643329057c5e0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 03:50:47 +0400
Subject: proc: move /proc/vmallocinfo to mm/vmalloc.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1d6d5c5cc2a..fd41a032456 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -132,31 +132,6 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
 
-#ifdef CONFIG_MMU
-static int vmalloc_open(struct inode *inode, struct file *file)
-{
-	unsigned int *ptr = NULL;
-	int ret;
-
-	if (NUMA_BUILD)
-		ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
-	ret = seq_open(file, &vmalloc_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = ptr;
-	} else
-		kfree(ptr);
-	return ret;
-}
-
-static const struct file_operations proc_vmalloc_operations = {
-	.open		= vmalloc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-#endif
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -295,9 +270,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_MMU
-	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
-#endif
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
-- 
cgit v1.2.3


From 8f32f7e5ac2ed11b0659b6b55af926f3d58ffd9d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:13:52 +0400
Subject: proc: move /proc/buddyinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index fd41a032456..a35e50659b8 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,19 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int fragmentation_open(struct inode *inode, struct file *file)
-{
-	(void)inode;
-	return seq_open(file, &fragmentation_op);
-}
-
-static const struct file_operations fragmentation_file_operations = {
-	.open		= fragmentation_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int pagetypeinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &pagetypeinfo_op);
@@ -270,7 +257,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
-- 
cgit v1.2.3


From 74e2e8e8ce7b3c0f878a349f9fa6cf2831548eef Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:15:36 +0400
Subject: proc: move /proc/pagetypeinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a35e50659b8..900331a634e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,18 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int pagetypeinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &pagetypeinfo_op);
-}
-
-static const struct file_operations pagetypeinfo_file_ops = {
-	.open		= pagetypeinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int zoneinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &zoneinfo_op);
@@ -257,7 +245,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
 	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
-- 
cgit v1.2.3


From b6aa44ab698c7df9d951d3eb45c4fcb8ba68fb25 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:17:48 +0400
Subject: proc: move /proc/vmstat boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 900331a634e..e7a301d5d43 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -69,17 +69,6 @@ static const struct file_operations proc_zoneinfo_file_operations = {
 	.release	= seq_release,
 };
 
-static int vmstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &vmstat_op);
-}
-static const struct file_operations proc_vmstat_file_operations = {
-	.open		= vmstat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_BLOCK
 static int diskstats_open(struct inode *inode, struct file *file)
 {
@@ -245,7 +234,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-- 
cgit v1.2.3


From 5c9fe6281b75832e8d2555ec8700ea763d9a865e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 04:19:42 +0400
Subject: proc: move /proc/zoneinfo boilerplate to mm/vmstat.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
---
 fs/proc/proc_misc.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e7a301d5d43..8f3a6f085c5 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,18 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-static int zoneinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &zoneinfo_op);
-}
-
-static const struct file_operations proc_zoneinfo_file_operations = {
-	.open		= zoneinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_BLOCK
 static int diskstats_open(struct inode *inode, struct file *file)
 {
@@ -234,7 +222,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #ifdef CONFIG_BLOCK
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
 #endif
-- 
cgit v1.2.3


From 31d85ab28e71b0c938e0ef48af45747e80d99b53 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 12:55:38 +0400
Subject: proc: move /proc/diskstats boilerplate to block/genhd.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/proc/proc_misc.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8f3a6f085c5..7c22831efd9 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,19 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-#ifdef CONFIG_BLOCK
-static int diskstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &diskstats_op);
-}
-static const struct file_operations proc_diskstats_operations = {
-	.open		= diskstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 #ifdef CONFIG_MODULES
 extern const struct seq_operations modules_op;
 static int modules_open(struct inode *inode, struct file *file)
@@ -222,9 +209,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_BLOCK
-	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-#endif
 #ifdef CONFIG_MODULES
 	proc_create("modules", 0, NULL, &proc_modules_operations);
 #endif
-- 
cgit v1.2.3


From 3b5d5c6b0ccba733a313f8752ebc3f8015628ba3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 13:19:27 +0400
Subject: proc: move /proc/modules boilerplate to kernel/module.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7c22831efd9..f6d25db9892 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,20 +57,6 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-#ifdef CONFIG_MODULES
-extern const struct seq_operations modules_op;
-static int modules_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &modules_op);
-}
-static const struct file_operations proc_modules_operations = {
-	.open		= modules_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 #ifdef CONFIG_PROC_PAGE_MONITOR
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
@@ -209,9 +195,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_MODULES
-	proc_create("modules", 0, NULL, &proc_modules_operations);
-#endif
 #ifdef CONFIG_SCHEDSTATS
 	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
 #endif
-- 
cgit v1.2.3


From b5aadf7f14c1acc94956aa257e018e9de3881f41 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 13:23:43 +0400
Subject: proc: move /proc/schedstat boilerplate to kernel/sched_stats.h

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index f6d25db9892..4a768ed5da2 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -195,9 +195,6 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_SCHEDSTATS
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
 	if (proc_root_kcore)
-- 
cgit v1.2.3


From 97ce5d6dcb07c403c0fc6001b755aacc38b5d7ff Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:14:19 +0400
Subject: proc: move all /proc/kcore stuff to fs/proc/kcore.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/kcore.c     | 14 +++++++++++++-
 fs/proc/proc_misc.c |  8 --------
 2 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c2370c76fb7..59b43a06887 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -27,6 +27,8 @@
 #define ELF_CORE_EFLAGS	0
 #endif
 
+static struct proc_dir_entry *proc_root_kcore;
+
 static int open_kcore(struct inode * inode, struct file * filp)
 {
 	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
@@ -34,7 +36,7 @@ static int open_kcore(struct inode * inode, struct file * filp)
 
 static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
 
-const struct file_operations proc_kcore_operations = {
+static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
 };
@@ -399,3 +401,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 	return acc;
 }
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
+	if (proc_root_kcore)
+		proc_root_kcore->size =
+				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
+	return 0;
+}
+module_init(proc_kcore_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 4a768ed5da2..5ed15ff8fd1 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -188,19 +188,11 @@ static struct file_operations proc_kpageflags_operations = {
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
-struct proc_dir_entry *proc_root_kcore;
-
 void __init proc_misc_init(void)
 {
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PROC_KCORE
-	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
-	if (proc_root_kcore)
-		proc_root_kcore->size =
-				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
-#endif
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
-- 
cgit v1.2.3


From 6d80e53f0056178c63fa8fbf3e8de40fb4df5f50 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:26:12 +0400
Subject: proc: move pagecount stuff to fs/proc/page.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |   1 +
 fs/proc/page.c      | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/proc_misc.c | 135 -----------------------------------------------
 3 files changed, 148 insertions(+), 135 deletions(-)
 create mode 100644 fs/proc/page.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index b921f4475c9..fef524410e8 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -24,3 +24,4 @@ proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
+proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
diff --git a/fs/proc/page.c b/fs/proc/page.c
new file mode 100644
index 00000000000..767d95a6d1b
--- /dev/null
+++ b/fs/proc/page.c
@@ -0,0 +1,147 @@
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 pcount;
+
+	pfn = src / KPMSIZE;
+	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
+		if (!ppage)
+			pcount = 0;
+		else
+			pcount = page_mapcount(ppage);
+
+		if (put_user(pcount, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpagecount_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecount_read,
+};
+
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
+
+/* These macros are used to decouple internal flags from exported ones */
+
+#define KPF_LOCKED     0
+#define KPF_ERROR      1
+#define KPF_REFERENCED 2
+#define KPF_UPTODATE   3
+#define KPF_DIRTY      4
+#define KPF_LRU        5
+#define KPF_ACTIVE     6
+#define KPF_SLAB       7
+#define KPF_WRITEBACK  8
+#define KPF_RECLAIM    9
+#define KPF_BUDDY     10
+
+#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+
+static ssize_t kpageflags_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 kflags, uflags;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
+		if (!ppage)
+			kflags = 0;
+		else
+			kflags = ppage->flags;
+
+		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
+			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
+			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
+			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
+			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
+			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
+			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
+			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
+			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
+			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
+
+		if (put_user(uflags, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpageflags_operations = {
+	.llseek = mem_lseek,
+	.read = kpageflags_read,
+};
+
+static int __init proc_page_init(void)
+{
+	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
+	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+	return 0;
+}
+module_init(proc_page_init);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5ed15ff8fd1..2ef9ef9bc8c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -57,146 +57,11 @@
 #include <asm/div64.h>
 #include "internal.h"
 
-#ifdef CONFIG_PROC_PAGE_MONITOR
-#define KPMSIZE sizeof(u64)
-#define KPMMASK (KPMSIZE - 1)
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-	u64 pcount;
-
-	pfn = src / KPMSIZE;
-	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		ppage = NULL;
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		pfn++;
-		if (!ppage)
-			pcount = 0;
-		else
-			pcount = page_mapcount(ppage);
-
-		if (put_user(pcount, out++)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		count -= KPMSIZE;
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
-}
-
-static struct file_operations proc_kpagecount_operations = {
-	.llseek = mem_lseek,
-	.read = kpagecount_read,
-};
-
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
-
-/* These macros are used to decouple internal flags from exported ones */
-
-#define KPF_LOCKED     0
-#define KPF_ERROR      1
-#define KPF_REFERENCED 2
-#define KPF_UPTODATE   3
-#define KPF_DIRTY      4
-#define KPF_LRU        5
-#define KPF_ACTIVE     6
-#define KPF_SLAB       7
-#define KPF_WRITEBACK  8
-#define KPF_RECLAIM    9
-#define KPF_BUDDY     10
-
-#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
-
-static ssize_t kpageflags_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-	u64 kflags, uflags;
-
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		ppage = NULL;
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		pfn++;
-		if (!ppage)
-			kflags = 0;
-		else
-			kflags = ppage->flags;
-
-		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
-			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
-			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
-			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
-			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
-			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
-			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
-			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
-			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
-			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
-			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
-
-		if (put_user(uflags, out++)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		count -= KPMSIZE;
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
-}
-
-static struct file_operations proc_kpageflags_operations = {
-	.llseek = mem_lseek,
-	.read = kpageflags_read,
-};
-#endif /* CONFIG_PROC_PAGE_MONITOR */
-
 void __init proc_misc_init(void)
 {
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PROC_PAGE_MONITOR
-	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
-	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
-#endif
 #ifdef CONFIG_PROC_VMCORE
 	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
 #endif
-- 
cgit v1.2.3


From 5aa140c2deca3701238d5acddf436ad7b02664c7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:36:31 +0400
Subject: proc: move /proc/vmcore creation to fs/proc/vmcore.c

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/proc_misc.c | 3 ---
 fs/proc/vmcore.c    | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 2ef9ef9bc8c..e2db35006c0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -62,7 +62,4 @@ void __init proc_misc_init(void)
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	/* And now for trickier ones */
-#ifdef CONFIG_PROC_VMCORE
-	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
-#endif
 }
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd9ca67f841..03ec5950490 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,7 +32,7 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-struct proc_dir_entry *proc_vmcore = NULL;
+static struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
@@ -162,7 +162,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 	return acc;
 }
 
-const struct file_operations proc_vmcore_operations = {
+static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
 };
 
@@ -652,7 +652,7 @@ static int __init vmcore_init(void)
 		return rc;
 	}
 
-	/* Initialize /proc/vmcore size if proc is already up. */
+	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
 	if (proc_vmcore)
 		proc_vmcore->size = vmcore_size;
 	return 0;
-- 
cgit v1.2.3


From 59c7572e82d69483a66eaa67b46548baeb69ecf4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 6 Oct 2008 14:49:39 +0400
Subject: proc: remove fs/proc/proc_misc.c

Now that everything was moved to their more or less expected places,
apply rm(1).

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/Makefile    |  2 +-
 fs/proc/proc_misc.c | 65 -----------------------------------------------------
 fs/proc/root.c      |  2 +-
 3 files changed, 2 insertions(+), 67 deletions(-)
 delete mode 100644 fs/proc/proc_misc.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index fef524410e8..63d965193b2 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o proc_misc.o
+		proc_tty.o
 proc-y	+= cmdline.o
 proc-y	+= cpuinfo.o
 proc-y	+= devices.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
deleted file mode 100644
index e2db35006c0..00000000000
--- a/fs/proc/proc_misc.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  linux/fs/proc/proc_misc.c
- *
- *  linux/fs/proc/array.c
- *  Copyright (C) 1992  by Linus Torvalds
- *  based on ideas by Darren Senn
- *
- *  This used to be the part of array.c. See the rest of history and credits
- *  there. I took this into a separate file and switched the thing to generic
- *  proc_file_inode_operations, leaving in array.c only per-process stuff.
- *  Inumbers allocation made dynamic (via create_proc_entry()).  AV, May 1999.
- *
- * Changes:
- * Fulton Green      :  Encapsulated position metric calculations.
- *			<kernel@FultonGreen.com>
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/fs.h>
-#include <linux/tty.h>
-#include <linux/string.h>
-#include <linux/mman.h>
-#include <linux/quicklist.h>
-#include <linux/proc_fs.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/pagemap.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/swap.h>
-#include <linux/slab.h>
-#include <linux/genhd.h>
-#include <linux/smp.h>
-#include <linux/signal.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/times.h>
-#include <linux/profile.h>
-#include <linux/utsname.h>
-#include <linux/blkdev.h>
-#include <linux/hugetlb.h>
-#include <linux/jiffies.h>
-#include <linux/vmalloc.h>
-#include <linux/crash_dump.h>
-#include <linux/pid_namespace.h>
-#include <linux/bootmem.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-#include <asm/tlb.h>
-#include <asm/div64.h>
-#include "internal.h"
-
-void __init proc_misc_init(void)
-{
-	proc_symlink("mounts", NULL, "self/mounts");
-
-	/* And now for trickier ones */
-}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 2a3abd25b30..7761602af9d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -117,7 +117,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
-	proc_misc_init();
+	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
 
-- 
cgit v1.2.3


From 4afe978530702c934dfdb11f54073136818b2119 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Wed, 22 Oct 2008 14:15:00 -0700
Subject: jbd: fix error handling for checkpoint io

When a checkpointing IO fails, current JBD code doesn't check the error
and continue journaling.  This means latest metadata can be lost from both
the journal and filesystem.

This patch leaves the failed metadata blocks in the journal space and
aborts journaling in the case of log_do_checkpoint().  To achieve this, we
need to do:

1. don't remove the failed buffer from the checkpoint list where in
   the case of __try_to_free_cp_buf() because it may be released or
   overwritten by a later transaction
2. log_do_checkpoint() is the last chance, remove the failed buffer
   from the checkpoint list and abort the journal
3. when checkpointing fails, don't update the journal super block to
   prevent the journaled contents from being cleaned.  For safety,
   don't update j_tail and j_tail_sequence either
4. when checkpointing fails, notify this error to the ext3 layer so
   that ext3 don't clear the needs_recovery flag, otherwise the
   journaled contents are ignored and cleaned in the recovery phase
5. if the recovery fails, keep the needs_recovery flag
6. prevent cleanup_journal_tail() from being called between
   __journal_drop_transaction() and journal_abort() (a race issue
   between journal_flush() and __log_wait_for_space()

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/checkpoint.c | 49 +++++++++++++++++++++++++++++++++++++------------
 fs/jbd/journal.c    | 28 ++++++++++++++++++++++------
 fs/jbd/recovery.c   |  7 +++++--
 3 files changed, 64 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb8..e29293501d4 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && buffer_uptodate(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -160,21 +161,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +199,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +211,8 @@ restart:
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +236,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +267,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(!buffer_uptodate(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +277,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +308,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +332,7 @@ int log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +349,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +362,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -371,14 +388,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -394,8 +415,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +426,9 @@ int cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index aa7143a8349..9e4fa52d7dc 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1121,9 +1121,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1146,11 +1149,16 @@ void journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1160,6 +1168,8 @@ void journal_destroy(journal_t *journal)
 		journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1359,10 +1369,16 @@ int journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1384,7 +1400,7 @@ int journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed06..db5e982c5dd 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do {									\
  */
 int journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 2d7c820e56ce83b23daee9eb5343730fb309418e Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Wed, 22 Oct 2008 14:15:01 -0700
Subject: ext3: add checks for errors from jbd

If the journal has aborted due to a checkpointing failure, we have to
keep the contents of the journal space.  Otherwise, the filesystem will
lose uncheckpointed metadata completely and become inconsistent.  To
avoid this, we need to keep needs_recovery flag if checkpoint has
failed.

With this patch, ext3_put_super() detects a checkpointing failure from
the return value of journal_destroy(), then it invokes ext3_abort() to
make the filesystem read only and keep needs_recovery flag.  Errors
from journal_flush() are also handled by this patch in some places.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Jan Kara <jack@ucw.cz>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/ioctl.c | 12 ++++++++----
 fs/ext3/super.c | 23 +++++++++++++++++++----
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 0d0c7015164..b7394d05ee8 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -239,7 +239,7 @@ setrsvsz_out:
 	case EXT3_IOC_GROUP_EXTEND: {
 		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -254,8 +254,10 @@ setrsvsz_out:
 		}
 		err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		journal_flush(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 group_extend_out:
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
@@ -263,7 +265,7 @@ group_extend_out:
 	case EXT3_IOC_GROUP_ADD: {
 		struct ext3_new_group_data input;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -280,8 +282,10 @@ group_extend_out:
 
 		err = ext3_group_add(sb, &input);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		journal_flush(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 group_add_out:
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a260af5544..cac29ee3b14 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -393,7 +393,8 @@ static void ext3_put_super (struct super_block * sb)
 	int i;
 
 	ext3_xattr_put_super(sb);
-	journal_destroy(sbi->s_journal);
+	if (journal_destroy(sbi->s_journal) < 0)
+		ext3_abort(sb, __func__, "Couldn't clean up the journal");
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -2296,7 +2297,9 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	journal_t *journal = EXT3_SB(sb)->s_journal;
 
 	journal_lock_updates(journal);
-	journal_flush(journal);
+	if (journal_flush(journal) < 0)
+		goto out;
+
 	lock_super(sb);
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
@@ -2305,6 +2308,8 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
+
+out:
 	journal_unlock_updates(journal);
 }
 
@@ -2404,7 +2409,13 @@ static void ext3_write_super_lockfs(struct super_block *sb)
 
 		/* Now we set up the journal barrier. */
 		journal_lock_updates(journal);
-		journal_flush(journal);
+
+		/*
+		 * We don't want to clear needs_recovery flag when we failed
+		 * to flush the journal.
+		 */
+		if (journal_flush(journal) < 0)
+			return;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
@@ -2822,8 +2833,12 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		 * otherwise be livelocked...
 		 */
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		journal_flush(EXT3_SB(sb)->s_journal);
+		err = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err) {
+			path_put(&nd.path);
+			return err;
+		}
 	}
 
 	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
-- 
cgit v1.2.3


From 9f818b4ac04f53458d0354950b4f229f54be4dbf Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Wed, 22 Oct 2008 14:15:02 -0700
Subject: jbd: test BH_Write_EIO to detect errors on metadata buffers

__try_to_free_cp_buf(), __process_buffer(), and __wait_cp_io() test
BH_Uptodate flag to detect write I/O errors on metadata buffers.  But by
commit 95450f5a7e53d5752ce1a0d0b8282e10fe745ae0 "ext3: don't read inode
block if the buffer has a write error"(*), BH_Uptodate flag can be set to
inode buffers with BH_Write_EIO in order to avoid reading old inode data.
So now, we have to test BH_Write_EIO flag of checkpointing inode buffers
instead of BH_Uptodate.  This patch does it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/checkpoint.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e29293501d4..fe852193324 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -94,7 +94,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	struct buffer_head *bh = jh2bh(jh);
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
-	    !buffer_dirty(bh) && buffer_uptodate(bh)) {
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -199,7 +199,7 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
-		if (unlikely(!buffer_uptodate(bh)))
+		if (unlikely(buffer_write_io_error(bh)))
 			ret = -EIO;
 
 		/*
@@ -268,7 +268,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
 		ret = 1;
-		if (unlikely(!buffer_uptodate(bh)))
+		if (unlikely(buffer_write_io_error(bh)))
 			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
-- 
cgit v1.2.3


From be07c4ed4043ab8c26f222348136141335e47a2f Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 22 Oct 2008 14:15:03 -0700
Subject: jbd: abort instead of waiting for nonexistent transactions

The __log_wait_for_space function sits in a loop checkpointing
transactions until there is sufficient space free in the journal.
However, if there are no transactions to be processed (e.g.  because the
free space calculation is wrong due to a corrupted filesystem) it will
never progress.

Check for space being required when no transactions are outstanding and
abort the journal instead of endlessly looping.

This patch fixes the bug reported by Sami Liedes at:
http://bugzilla.kernel.org/show_bug.cgi?id=10976

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Tested-by: Sami Liedes <sliedes@cc.hut.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/checkpoint.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index fe852193324..1bd8d4acc6f 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -127,14 +127,29 @@ void __log_wait_for_space(journal_t *journal)
 
 		/*
 		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock
+		 * were waiting for the checkpoint lock. If there are no
+		 * outstanding transactions there is nothing to checkpoint and
+		 * we can't make progress. Abort the journal in this case.
 		 */
 		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		if (__log_space_left(journal) < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+
+			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
-			log_do_checkpoint(journal);
+			if (chkpt) {
+				log_do_checkpoint(journal);
+			} else {
+				printk(KERN_ERR "%s: no transactions\n",
+				       __func__);
+				journal_abort(journal, 0);
+			}
+
 			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
 		}
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
-- 
cgit v1.2.3


From 12e1ec9ff31d388305da644b452c9f80d244aa55 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 23 Oct 2008 11:48:56 -0700
Subject: ext3 quota support: fix compile failure

This one was due to a merge error: we added a use of nd.path in commit
2d7c820e56ce83b23daee9eb5343730fb309418e ("ext3: add checks for errors
from jbd"), and concurrently we got rid of 'nd' and used a naked 'path'
in commit 8264613def2e5c4f12bc3167713090fd172e6055 ("[PATCH] switch
quota_on-related stuff to kern_path()").

That all merged cleanly, but it didn't actually _work_.  This should fix
it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8147dd44ced..18eaa78ecb4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2836,7 +2836,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		err = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
 		if (err) {
-			path_put(&nd.path);
+			path_put(&path);
 			return err;
 		}
 	}
-- 
cgit v1.2.3


From 3856d30ded1fe43c6657927ebad402d25cd128f4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Oct 2008 20:33:29 +0200
Subject: ext4: remove unused variable in ext4_get_parent

Signed-off-by: Christoph Hellwig <hch@lst.de>
[ All users removed in "switch all filesystems over to d_obtain_alias",
  aka commit 440037287c5ebb07033ab927ca16bb68c291d309 ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/namei.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5b93a7d94d4..63adcb79298 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1061,7 +1061,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 struct dentry *ext4_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry *parent;
 	struct inode *inode;
 	static const struct qstr dotdot = {
 		.name = "..",
-- 
cgit v1.2.3


From 8c9fa93d51123c5540762b1a9e1919d6f9c4af7c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Oct 2008 11:38:37 -0400
Subject: ext3: Fix duplicate entries returned from getdents() system call

Fix a regression caused by commit 6a897cf4, "ext3: fix ext3_dx_readdir
hash collision handling", where deleting files in a large directory
(requiring more than one getdents system call), results in some
filenames being returned twice.  This was caused by a failure to
update info->curr_hash and info->curr_minor_hash, so that if the
directory had gotten modified since the last getdents() system call
(as would be the case if the user is running "rm -r" or "git clean"),
a directory entry would get returned twice to the userspace.

This patch fixes the bug reported by Markus Trippelsdorf at:
http://bugzilla.kernel.org/show_bug.cgi?id=11844

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Markus Trippelsdorf <markus@trippelsdorf.de>
---
 fs/ext3/dir.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 4c82531ea0a..5853f4440af 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -456,17 +456,8 @@ static int ext3_dx_readdir(struct file * filp,
 	if (info->extra_fname) {
 		if (call_filldir(filp, dirent, filldir, info->extra_fname))
 			goto finished;
-
 		info->extra_fname = NULL;
-		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
-			if (info->next_hash == ~0) {
-				filp->f_pos = EXT3_HTREE_EOF;
-				goto finished;
-			}
-			info->curr_hash = info->next_hash;
-			info->curr_minor_hash = 0;
-		}
+		goto next_node;
 	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
@@ -498,9 +489,14 @@ static int ext3_dx_readdir(struct file * filp,
 		info->curr_minor_hash = fname->minor_hash;
 		if (call_filldir(filp, dirent, filldir, fname))
 			break;
-
+	next_node:
 		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
 			if (info->next_hash == ~0) {
 				filp->f_pos = EXT3_HTREE_EOF;
 				break;
-- 
cgit v1.2.3


From 3c37fc86d20fe35be656f070997d62f75c2e4874 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Oct 2008 11:39:08 -0400
Subject: ext4: Fix duplicate entries returned from getdents() system call

Fix a regression caused by commit d0156417, "ext4: fix ext4_dx_readdir
hash collision handling", where deleting files in a large directory
(requiring more than one getdents system call), results in some
filenames being returned twice.  This was caused by a failure to
update info->curr_hash and info->curr_minor_hash, so that if the
directory had gotten modified since the last getdents() system call
(as would be the case if the user is running "rm -r" or "git clean"),
a directory entry would get returned twice to the userspace.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

This patch fixes the bug reported by Markus Trippelsdorf at:
http://bugzilla.kernel.org/show_bug.cgi?id=11844

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Markus Trippelsdorf <markus@trippelsdorf.de>
---
 fs/ext4/dir.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3ca6a2b7632..fed5b610df5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -459,17 +459,8 @@ static int ext4_dx_readdir(struct file *filp,
 	if (info->extra_fname) {
 		if (call_filldir(filp, dirent, filldir, info->extra_fname))
 			goto finished;
-
 		info->extra_fname = NULL;
-		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
-			if (info->next_hash == ~0) {
-				filp->f_pos = EXT4_HTREE_EOF;
-				goto finished;
-			}
-			info->curr_hash = info->next_hash;
-			info->curr_minor_hash = 0;
-		}
+		goto next_node;
 	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
@@ -501,9 +492,14 @@ static int ext4_dx_readdir(struct file *filp,
 		info->curr_minor_hash = fname->minor_hash;
 		if (call_filldir(filp, dirent, filldir, fname))
 			break;
-
+	next_node:
 		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
 			if (info->next_hash == ~0) {
 				filp->f_pos = EXT4_HTREE_EOF;
 				break;
-- 
cgit v1.2.3


From 4d36a9e65d4966b433b2f3424d9457468bc80e00 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 25 Oct 2008 12:41:41 -0700
Subject: select: deal with math overflow from borderline valid userland data

Some userland apps seem to pass in a "0" for the seconds, and several
seconds worth of usecs to select().  The old kernels accepted this just
fine, so the new kernels must too.

However, due to the upscaling of the microseconds to nanoseconds we had
some cases where we got math overflow, and depending on the GCC version
(due to inlining decisions) that actually resulted in an -EINVAL return.

This patch fixes this by adding the excess microseconds to the seconds
field.

Also with thanks to Marcin Slusarz for spotting some implementation bugs
in the diagnostics patches.

Reported-by: Carlos R. Mafra <crmafra2@gmail.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 5 +++--
 fs/select.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index fe3c9bf8760..e5f49f53850 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1684,8 +1684,9 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 			return -EFAULT;
 
 		to = &end_time;
-		if (poll_select_set_timeout(to, tv.tv_sec,
-					    tv.tv_usec * NSEC_PER_USEC))
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
 			return -EINVAL;
 	}
 
diff --git a/fs/select.c b/fs/select.c
index 448e4400128..87df51eadcf 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -519,8 +519,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EFAULT;
 
 		to = &end_time;
-		if (poll_select_set_timeout(to, tv.tv_sec,
-					    tv.tv_usec * NSEC_PER_USEC))
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 9ce209d64d820a6d5ed6b952e2c0f917faad6031 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Fri, 17 Oct 2008 16:17:40 -0700
Subject: epoll: avoid double-inserts in case of EFAULT

In commit f337b9c58332bdecde965b436e47ea4c94d30da0 ("epoll: drop
unnecessary test") Thomas found that there is an unnecessary (always
true) test in ep_send_events().  The callback never inserts into
->rdllink while the send loop is performed, and also does the
~EP_PRIVATE_BITS test.  Given we're holding the mutex during this time,
the conditions tested inside the loop are always true.

HOWEVER.

The test "!ep_is_linked(&epi->rdllink)" wasn't there because we insert
into ->rdllink, but because the send-events loop might terminate before
the whole list is scanned (-EFAULT).

In such cases, when the loop terminates early, and when a (leftover)
file received an event while we're performing the lockless loop, we need
such test to avoid to double insert the epoll items.  The list_splice()
done a few steps below, will correctly re-insert the ones that were left
on "txlist".

This should fix the kenrel.org bugzilla entry 11831.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 99368bda026..aec5c13f634 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -930,8 +930,15 @@ errxit:
 	 * inside the main ready-list here.
 	 */
 	for (nepi = ep->ovflist; (epi = nepi) != NULL;
-	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR)
-		list_add_tail(&epi->rdllink, &ep->rdllist);
+	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		/*
+		 * If the above loop quit with errors, the epoll item might still
+		 * be linked to "txlist", and the list_splice() done below will
+		 * take care of those cases.
+		 */
+		if (!ep_is_linked(&epi->rdllink))
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+	}
 	/*
 	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
 	 * releasing the lock, events will be queued in the normal way inside
-- 
cgit v1.2.3


From 526719ba51e7d7bd31f7af9ab04b015b70096685 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Oct 2008 15:19:48 +0000
Subject: Switch to a valid email address...

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/psdev.c | 2 +-
 fs/nfs/inode.c  | 2 +-
 fs/nfs/super.c  | 2 +-
 fs/proc/array.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index cfd29da714d..0376ac66c44 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -2,7 +2,7 @@
  *      	An implementation of a loadable kernel mode driver providing
  *		multiple kernel/user space bidirectional communications links.
  *
- * 		Author: 	Alan Cox <alan@redhat.com>
+ * 		Author: 	Alan Cox <alan@lxorguk.ukuu.org.uk>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b9195c02a86..dc52793ff8f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -5,7 +5,7 @@
  *
  *  nfs inode and superblock handling functions
  *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
  *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
  *
  *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a3b0061dfd4..f48db679a1c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -5,7 +5,7 @@
  *
  *  nfs superblock handling functions
  *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
  *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
  *
  *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb9f4b05703..6af7fba7abb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -40,7 +40,7 @@
  *
  *
  * Alan Cox	     :  security fixes.
- *			<Alan.Cox@linux.org>
+ *			<alan@lxorguk.ukuu.org.uk>
  *
  * Al Viro           :  safe handling of mm_struct
  *
-- 
cgit v1.2.3


From 6c87df37dcb9c6c33923707fa5191e0a65874d60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 27 Oct 2008 22:38:27 +0300
Subject: proc: revert /proc/uptime to ->read_proc hook

Turned out some VMware userspace does pread(2) on /proc/uptime, but
seqfiles currently don't allow pread() resulting in -ESPIPE.

Seqfiles in theory can do pread(), but this can be a long story,
so revert to ->read_proc until then.

http://bugzilla.kernel.org/show_bug.cgi?id=11856

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/uptime.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f14..df26aa88fa4 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,43 +1,45 @@
-#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
-#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static int proc_calc_metrics(char *page, char **start, off_t off,
+				 int count, int *eof, int len)
+{
+	if (len <= off + count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+static int uptime_read_proc(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
 {
 	struct timespec uptime;
 	struct timespec idle;
+	int len;
 	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
 	cputime_to_timespec(idletime, &idle);
-	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+	len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
 			(unsigned long) idle.tv_sec,
 			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-	return 0;
+	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
-	.open		= uptime_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
 	return 0;
 }
 module_init(proc_uptime_init);
-- 
cgit v1.2.3


From 44d6f78756560e95903de239e10f8a40a6eae444 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:51:46 -0400
Subject: ext3: fix a bug accessing freed memory in ext3_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext3_put_super() calls
ext3_abort() after freeing the journal_t object, and then ext3_abort()
accesses it.  This patch fix it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext3/super.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18eaa78ecb4..e5717a4fae6 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -281,7 +281,8 @@ void ext3_abort (struct super_block * sb, const char * function,
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
-	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+	if (EXT3_SB(sb)->s_journal)
+		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 
 void ext3_warning (struct super_block * sb, const char * function,
@@ -390,11 +391,14 @@ static void ext3_put_super (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext3_xattr_put_super(sb);
-	if (journal_destroy(sbi->s_journal) < 0)
+	err = journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
+	if (err < 0)
 		ext3_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.3


From ef2cabf7c6d838eb0ee2b4fb8ef84f7c06ce16d9 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:53:05 -0400
Subject: ext4: fix a bug accessing freed memory in ext4_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext4_put_super() calls
ext4_abort() after freeing the journal_t object, and then ext4_abort()
accesses it.  This patch fix it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bdddea14e78..994859df010 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,8 @@ void ext4_abort(struct super_block *sb, const char *function,
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
-	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+	if (EXT4_SB(sb)->s_journal)
+		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
 void ext4_warning(struct super_block *sb, const char *function,
@@ -442,14 +443,16 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	if (jbd2_journal_destroy(sbi->s_journal) < 0)
-		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+	err = jbd2_journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
+	if (err < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.3


From 6c20ec850360bc6e5c66a787f0523a80450d65ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 28 Oct 2008 21:08:20 -0400
Subject: jbd2: Call the commit callback before the transaction could get
 dropped

The transaction can potentially get dropped if there are no buffers
that need to be written.  Make sure we call the commit callback before
potentially deciding to drop the transaction.  Also avoid
dereferencing the commit_transaction pointer in the marker for the
same reason.

This patch fixes the bug reported by Eric Paris at:
http://bugzilla.kernel.org/show_bug.cgi?id=11838

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Eric Paris <eparis@redhat.com>
---
 fs/jbd2/commit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8b119e16aa3..ebc667bc54a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -974,6 +974,9 @@ restart_loop:
 	journal->j_committing_transaction = NULL;
 	spin_unlock(&journal->j_state_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -995,11 +998,8 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (journal->j_commit_callback)
-		journal->j_commit_callback(journal, commit_transaction);
-
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_devname, journal->j_commit_sequence,
 		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
-- 
cgit v1.2.3


From 8c3bf8a01c005385e9be0bc992e10abfb355278c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:12 -0400
Subject: merge ext4_claim_free_blocks & ext4_has_free_blocks

Mingming pointed out that ext4_claim_free_blocks & ext4_has_free_blocks
are largely cut & pasted; they can be collapsed/merged as follows.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 57 ++++++++++++++++----------------------------------------
 fs/ext4/ext4.h   |  3 +--
 2 files changed, 17 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b9821be709b..e28203ec45b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -589,8 +589,15 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	return;
 }
 
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						s64 nblocks)
+/**
+ * ext4_has_free_blocks()
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
+ *
+ * Check if filesystem has nblocks free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks;
 	s64 root_blocks = 0;
@@ -620,53 +627,21 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 	 */
 	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
-		return -ENOSPC;
+		return 0;
 
-	/* Add the blocks to nblocks */
-	percpu_counter_add(dbc, nblocks);
-	return 0;
+	return 1;
 }
 
-/**
- * ext4_has_free_blocks()
- * @sbi:	in-core super block structure.
- * @nblocks:	number of neeed blocks
- *
- * Check if filesystem has free blocks available for allocation.
- * Return the number of blocks avaible for allocation for this request
- * On success, return nblocks
- */
-ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
-	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
-	free_blocks  = percpu_counter_read_positive(fbc);
-	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
-
-	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
-	}
-	if (free_blocks <= (root_blocks + dirty_blocks))
-		/* we don't have free space */
+	if (ext4_has_free_blocks(sbi, nblocks)) {
+		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
-
-	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
-		return free_blocks - (root_blocks + dirty_blocks);
-	return nblocks;
+	} else
+		return -ENOSPC;
 }
 
-
 /**
  * ext4_should_retry_alloc()
  * @sb:			super block
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4880cc3e672..b0537c82702 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1003,8 +1003,7 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-					 s64 nblocks);
+extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-- 
cgit v1.2.3


From a996031c87e093017c0763326a08896a3a4817f4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:17 -0400
Subject: delay capable() check in ext4_has_free_blocks()

As reported by Eric Paris, the capable() check in ext4_has_free_blocks()
sometimes causes SELinux denials.

We can rearrange the logic so that we only try to use the root-reserved
blocks when necessary, and even then we can move the capable() test
to last, to avoid the check most of the time.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e28203ec45b..d2003cdc36a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -599,18 +599,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
  */
 int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
+	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
 	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
 	free_blocks  = percpu_counter_read_positive(fbc);
 	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
+	root_blocks = ext4_r_blocks_count(sbi->s_es);
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
@@ -623,13 +618,20 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 		}
 	}
 	/* Check whether we have space after
-	 * accounting for current dirty blocks
+	 * accounting for current dirty blocks & root reserved blocks.
 	 */
-	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
-		/* we don't have free space */
-		return 0;
+	if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+		return 1;
 
-	return 1;
+	/* Hm, nope.  Are (enough) root reserved blocks available? */
+	if (sbi->s_resuid == current->fsuid ||
+	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE)) {
+		if (free_blocks >= (nblocks + dirty_blocks))
+			return 1;
+	}
+
+	return 0;
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-- 
cgit v1.2.3


From ae05f269400533cbb32bfba131ab528d78dffd16 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 28 Oct 2008 15:21:40 -0400
Subject: NFS: Convert nfs_attr_generation_counter into an atomic_long

The most important property we need from nfs_attr_generation_counter is
monotonicity, which is not guaranteed by the current system of smp memory
barriers. We should convert it to an atomic_long_t, and drop the memory
barriers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dc52793ff8f..d22eb383e1c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -908,21 +908,16 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
-static unsigned long nfs_attr_generation_counter;
+static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
 {
-	smp_rmb();
-	return nfs_attr_generation_counter;
+	return atomic_long_read(&nfs_attr_generation_counter);
 }
 
 unsigned long nfs_inc_attr_generation_counter(void)
 {
-	unsigned long ret;
-	smp_rmb();
-	ret = ++nfs_attr_generation_counter;
-	smp_wmb();
-	return ret;
+	return atomic_long_inc_return(&nfs_attr_generation_counter);
 }
 
 void nfs_fattr_init(struct nfs_fattr *fattr)
-- 
cgit v1.2.3


From edf1ae403896cb7750800508b14996ba6be39a53 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 29 Oct 2008 00:47:57 +0000
Subject: [CIFS] Reduce number of socket retries in large write path

CIFS in some heavy stress conditions cifs could get EAGAIN
repeatedly in smb_send2 which led to repeated retries and eventually
failure of large writes which could lead to data corruption.

There are three changes that were suggested by various network
developers:

1) convert cifs from non-blocking to blocking tcp sendmsg
(we left in the retry on failure)
2) change cifs to not set sendbuf and rcvbuf size for the socket
(let tcp autotune the buffer sizes since that works much better
in the TCP stack now)
3) if we have a partial frame sent in smb_send2, mark the tcp
session as invalid (close the socket and reconnect) so we do
not corrupt the remaining part of the SMB with the beginning
of the next SMB.

This does not appear to hurt performance measurably and has
been run in various scenarios, but it definately removes
a corruption that we were seeing in some high stress
test cases.

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES     |  6 +++++-
 fs/cifs/cifsglob.h  |  2 ++
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/connect.c   | 50 +++++++++++++++++++++++++++++++++++++-------------
 fs/cifs/transport.c | 41 +++++++++++++++++++++++++++++++----------
 5 files changed, 76 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8f528ea24c4..8855331b2fb 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,11 @@ Various fixes to make delete of open files behavior more predictable
 (when delete of an open file fails we mark the file as "delete-on-close"
 in a way that more servers accept, but only if we can first rename the
 file to a temporary name).  Add experimental support for more safely
-handling fcntl(F_SETLEASE).
+handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests).
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c791e5b5a91..1cb1189f24e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -141,6 +141,8 @@ struct TCP_Server_Info {
 	char versionMajor;
 	char versionMinor;
 	bool svlocal:1;			/* local server or remote */
+	bool noblocksnd;		/* use blocking sendmsg */
+	bool noautotune;		/* do not autotune send buf sizes */
 	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0cff7fe986e..6f21ecb85ce 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,7 +36,7 @@ extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
 extern int smb_send(struct socket *, struct smb_hdr *,
-			unsigned int /* length */ , struct sockaddr *);
+			unsigned int /* length */ , struct sockaddr *, bool);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71b7661e226..e9f9248cb3f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,6 +92,8 @@ struct smb_vol {
 	bool seal:1;       /* request transport encryption on share */
 	bool nodfs:1;      /* Do not request DFS, even if available */
 	bool local_lease:1; /* check leases only on local system, not remote */
+	bool noblocksnd:1;
+	bool noautotune:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -102,9 +104,11 @@ struct smb_vol {
 static int ipv4_connect(struct sockaddr_in *psin_server,
 			struct socket **csocket,
 			char *netb_name,
-			char *server_netb_name);
+			char *server_netb_name,
+			bool noblocksnd,
+			bool nosndbuf); /* ipv6 never set sndbuf size */
 static int ipv6_connect(struct sockaddr_in6 *psin_server,
-			struct socket **csocket);
+			struct socket **csocket, bool noblocksnd);
 
 
 	/*
@@ -191,12 +195,13 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		try_to_freeze();
 		if (server->protocolType == IPV6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
-					  &server->ssocket);
+					  &server->ssocket, server->noautotune);
 		} else {
 			rc = ipv4_connect(&server->addr.sockAddr,
 					&server->ssocket,
 					server->workstation_RFC1001_name,
-					server->server_RFC1001_name);
+					server->server_RFC1001_name,
+					server->noblocksnd, server->noautotune);
 		}
 		if (rc) {
 			cFYI(1, ("reconnect error %d", rc));
@@ -1192,6 +1197,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			/* ignore */
 		} else if (strnicmp(data, "rw", 2) == 0) {
 			vol->rw = true;
+		} else if (strnicmp(data, "noblocksend", 11) == 0) {
+			vol->noblocksnd = 1;
+		} else if (strnicmp(data, "noautotune", 10) == 0) {
+			vol->noautotune = 1;
 		} else if ((strnicmp(data, "suid", 4) == 0) ||
 				   (strnicmp(data, "nosuid", 6) == 0) ||
 				   (strnicmp(data, "exec", 4) == 0) ||
@@ -1518,7 +1527,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 
 static int
 ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
-	     char *netbios_name, char *target_name)
+	     char *netbios_name, char *target_name,
+	     bool noblocksnd, bool noautotune)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1590,11 +1600,16 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 		 (*csocket)->sk->sk_sndbuf,
 		 (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo));
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 	/* make the bufsizes depend on wsize/rsize and max requests */
-	if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
-		(*csocket)->sk->sk_sndbuf = 200 * 1024;
-	if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
-		(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	if (noautotune) {
+		if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
+			(*csocket)->sk->sk_sndbuf = 200 * 1024;
+		if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
+			(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	}
 
 	/* send RFC1001 sessinit */
 	if (psin_server->sin_port == htons(RFC1001_PORT)) {
@@ -1631,7 +1646,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 			/* sizeof RFC1002_SESSION_REQUEST with no scope */
 			smb_buf->smb_buf_length = 0x81000044;
 			rc = smb_send(*csocket, smb_buf, 0x44,
-				(struct sockaddr *)psin_server);
+				(struct sockaddr *)psin_server, noblocksnd);
 			kfree(ses_init_buf);
 			msleep(1); /* RFC1001 layer in at least one server
 				      requires very short break before negprot
@@ -1651,7 +1666,8 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 }
 
 static int
-ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
+ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket,
+	     bool noblocksnd)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1720,6 +1736,9 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
 		the default. sock_setsockopt not used because it expects
 		user space buffer */
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 
 	return rc;
 }
@@ -1983,11 +2002,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket);
+			rc = ipv6_connect(&sin_server6, &csocket,
+					volume_info.noblocksnd);
 		} else
 			rc = ipv4_connect(&sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
-				  volume_info.target_rfc1001_name);
+				  volume_info.target_rfc1001_name,
+				  volume_info.noblocksnd,
+				  volume_info.noautotune);
 		if (rc < 0) {
 			cERROR(1, ("Error connecting to IPv4 socket. "
 				   "Aborting operation"));
@@ -2002,6 +2024,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			sock_release(csocket);
 			goto out;
 		} else {
+			srvTcp->noblocksnd = volume_info.noblocksnd;
+			srvTcp->noautotune = volume_info.noautotune;
 			memcpy(&srvTcp->addr.sockAddr, &sin_server,
 				sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bf0e6d8e382..ba4d66644eb 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -161,7 +161,7 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 
 int
 smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-	 unsigned int smb_buf_length, struct sockaddr *sin)
+	 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -178,7 +178,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -229,8 +232,8 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 }
 
 static int
-smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
-	  struct sockaddr *sin)
+smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
+	  struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -240,6 +243,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	unsigned int total_len;
 	int first_vec = 0;
 	unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+	struct socket *ssocket = server->ssocket;
 
 	if (ssocket == NULL)
 		return -ENOTSOCK; /* BB eventually add reconnect code here */
@@ -248,7 +252,10 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -312,6 +319,16 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 		i = 0; /* in case we get ENOSPC on the next send */
 	}
 
+	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
+		cFYI(1, ("partial send (%d remaining), terminating session",
+			total_len));
+		/* If we have only sent part of an SMB then the next SMB
+		   could be taken as the remainder of this one.  We need
+		   to kill the socket so the server throws away the partial
+		   SMB */
+		server->tcpStatus = CifsNeedReconnect;
+	}
+
 	if (rc < 0) {
 		cERROR(1, ("Error %d sending data on socket to server", rc));
 	} else
@@ -518,8 +535,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send2(ses->server->ssocket, iov, n_vec,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	rc = smb_send2(ses->server, iov, n_vec,
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		       ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -711,7 +729,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -851,7 +870,8 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
 		return rc;
 	}
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-	      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	      (struct sockaddr *) &(ses->server->addr.sockAddr),
+	      ses->server->noblocksnd);
 	up(&ses->server->tcpSem);
 	return rc;
 }
@@ -941,7 +961,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
-- 
cgit v1.2.3


From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 29 Oct 2008 14:00:55 -0700
Subject: fs: remove prepare_write/commit_write

Nothing uses prepare_write or commit_write. Remove them from the tree
completely.

[akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c  | 2 +-
 fs/libfs.c      | 2 +-
 fs/ocfs2/file.c | 3 +--
 fs/splice.c     | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c37..2b2eec1283b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 	if (rw == WRITE) {
 		/*
-		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
 		 * so we need to update the ->mmu_private to block boundary.
 		 *
 		 * But we must fill the remaining area or hole by nul for
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf..e960a832190 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a7807..7efe937a415 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -679,8 +679,7 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c2715..1abab5cee4b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	};
 
 	/*
-	 * The actor worker might be calling ->prepare_write and
-	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * The actor worker might be calling ->write_begin and
+	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
 	 */
-- 
cgit v1.2.3


From 87b811c3f96559e466403e22b1fa99d472571625 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 29 Oct 2008 14:01:08 -0700
Subject: ecryptfs: fix memory corruption when storing crypto info in xattrs

When ecryptfs allocates space to write crypto headers into, before copying
it out to file headers or to xattrs, it looks at the value of
crypt_stat->num_header_bytes_at_front to determine how much space it
needs.  This is also used as the file offset to the actual encrypted data,
so for xattr-stored crypto info, the value was zero.

So, we kzalloc'd 0 bytes, and then ran off to write to that memory.
(Which returned as ZERO_SIZE_PTR, so we explode quickly).

The right answer is to always allocate a page to write into; the current
code won't ever write more than that (this is enforced by the
(PAGE_CACHE_SIZE - offset) length in the call to
ecryptfs_generate_key_packet_set).  To be explicit about this, we now send
in a "max" parameter, rather than magically using PAGE_CACHE_SIZE there.

Also, since the pointer we pass down the callchain eventually gets the
virt_to_page() treatment, we should be using a alloc_page variant, not
kzalloc (see also 7fcba054373d5dfc43d26e243a5c9b92069972ee)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 06db79d05c1..6046239465a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1251,6 +1251,7 @@ struct kmem_cache *ecryptfs_header_cache_2;
 /**
  * ecryptfs_write_headers_virt
  * @page_virt: The virtual address to write the headers to
+ * @max: The size of memory allocated at page_virt
  * @size: Set to the number of bytes written by this function
  * @crypt_stat: The cryptographic context
  * @ecryptfs_dentry: The eCryptfs dentry
@@ -1278,7 +1279,8 @@ struct kmem_cache *ecryptfs_header_cache_2;
  *
  * Returns zero on success
  */
-static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
+static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
+				       size_t *size,
 				       struct ecryptfs_crypt_stat *crypt_stat,
 				       struct dentry *ecryptfs_dentry)
 {
@@ -1296,7 +1298,7 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
 	offset += written;
 	rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
 					      ecryptfs_dentry, &written,
-					      PAGE_CACHE_SIZE - offset);
+					      max - offset);
 	if (rc)
 		ecryptfs_printk(KERN_WARNING, "Error generating key packet "
 				"set; rc = [%d]\n", rc);
@@ -1368,14 +1370,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out;
 	}
 	/* Released in this function */
-	virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL);
+	virt = (char *)get_zeroed_page(GFP_KERNEL);
 	if (!virt) {
 		printk(KERN_ERR "%s: Out of memory\n", __func__);
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = ecryptfs_write_headers_virt(virt, &size, crypt_stat,
-					 ecryptfs_dentry);
+	rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
+					 crypt_stat, ecryptfs_dentry);
 	if (unlikely(rc)) {
 		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
 		       __func__, rc);
@@ -1393,8 +1395,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out_free;
 	}
 out_free:
-	memset(virt, 0, crypt_stat->num_header_bytes_at_front);
-	kfree(virt);
+	free_page((unsigned long)virt);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From e74481e23283fb080d4591c258de20785cc3b6c3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 29 Oct 2008 14:01:10 -0700
Subject: fs: remove excess kernel-doc

Delete excess kernel-doc notation in fs/ subdirectory:

Warning(linux-2.6.27-git10//fs/jbd/transaction.c:886): Excess function parameter or struct member 'credits' description in 'journal_get_undo_access'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d15cd6e7251..60d4c32c880 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -860,7 +860,6 @@ out:
  * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
  * @handle: transaction
  * @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
  *
  * Sometimes there is a need to distinguish between metadata which has
  * been committed to disk and that which has not.  The ext3fs code uses
-- 
cgit v1.2.3


From 61de800d33af585cb7e6f27b5cdd51029c6855cb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 30 Oct 2008 20:15:22 +0000
Subject: [CIFS] fix error in smb_send2

smb_send2 exit logic was strange, and with the previous change
could cause us to fail large
smb writes when all of the smb was not sent as one chunk.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c   | 2 +-
 fs/cifs/file.c      | 2 +-
 fs/cifs/transport.c | 7 +++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 843a85fb8b9..d5eac48fc41 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1536,7 +1536,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
-	/* cFYI(1,("write at %lld %d bytes",offset,count));*/
+	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 62d8bd8f14c..ead1a3bb025 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1824,7 +1824,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	pTcon = cifs_sb->tcon;
 
 	pagevec_init(&lru_pvec, 0);
-		cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, ("rpages: num pages %d", num_pages));
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ba4d66644eb..ff8243a8fe3 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -290,8 +290,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 		if (rc < 0)
 			break;
 
-		if (rc >= total_len) {
-			WARN_ON(rc > total_len);
+		if (rc == total_len) {
+			total_len = 0;
+			break;
+		} else if (rc > total_len) {
+			cERROR(1, ("sent %d requested %d", rc, total_len));
 			break;
 		}
 		if (rc == 0) {
-- 
cgit v1.2.3


From 8d7c4203c681a3ec359eccff4e53bc8c0ccf403b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 30 Oct 2008 13:48:33 -0400
Subject: nfsd: fix failure to set eof in readdir in some situations

Before 14f7dd632011bb89c035722edd6ea0d90ca6b078 "[PATCH] Copy XFS
readdir hack into nfsd code", readdir_cd->err was reset to eof before
each call to vfs_readdir; afterwards, it is set only once.  Similarly,
c002a6c7977320f95b5edede5ce4e0eeecf291ff "[PATCH] Optimise NFS readdir
hack slightly", can cause us to exit without nfserr_eof set.  Fix this.

This ensures the "eof" bit is set when needed in readdir replies.  (The
particular case I saw was an nfsv4 readdir of an empty directory, which
returned with no entries (the protocol requires "." and ".." to be
filtered out), but with eof unset.)

Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0bc56f6d927..848a03e83a4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,6 +1912,7 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
+		cdp->err = nfserr_eof;
 		if (!buf.full)
 			break;
 	}
-- 
cgit v1.2.3


From d7dc61d0a70371b1c6557ea8ffbc60fff94c8168 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@ORACLE.COM>
Date: Thu, 23 Oct 2008 00:50:35 -0400
Subject: NLM: Set address family before calling nlm_host_rebooted()

The nlm_host_rebooted() function uses nlm_cmp_addr() to find an
nsm_handle that matches the rebooted peer.  In order for this to work,
the passed-in address must have a proper address family.

This fixes a post-2.6.28 regression introduced by commit 781b61a6, which
added AF_INET6 support to nlm_cmp_addr().  Before that commit,
nlm_cmp_addr() didn't care about the address family; it compared only
the sin_addr.s_addr field for equality.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc4proc.c | 1 +
 fs/lockd/svcproc.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 014f6ce4817..4dfdcbc6bf6 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -434,6 +434,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 548b0bb2b84..3ca89e2a938 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -466,6 +466,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
-- 
cgit v1.2.3


From b27cf88e9592953ae292d05324887f2f44979433 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Fri, 31 Oct 2008 14:52:24 +0000
Subject: [JFFS2] Fix lack of locking in thread_should_wake()

The thread_should_wake() function trawls through the list of 'very
dirty' eraseblocks, determining whether the background GC thread should
wake. Doing this without holding the appropriate locks is a bad idea.

OLPC Trac #8615

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: stable@kernel.org
---
 fs/jffs2/background.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 8adebd3e43c..3cceef4ad2b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -85,15 +85,15 @@ static int jffs2_garbage_collect_thread(void *_c)
 	for (;;) {
 		allow_signal(SIGHUP);
 	again:
+		spin_lock(&c->erase_completion_lock);
 		if (!jffs2_thread_should_wake(c)) {
 			set_current_state (TASK_INTERRUPTIBLE);
+			spin_unlock(&c->erase_completion_lock);
 			D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n"));
-			/* Yes, there's a race here; we checked jffs2_thread_should_wake()
-			   before setting current->state to TASK_INTERRUPTIBLE. But it doesn't
-			   matter - We don't care if we miss a wakeup, because the GC thread
-			   is only an optimisation anyway. */
 			schedule();
-		}
+		} else
+			spin_unlock(&c->erase_completion_lock);
+			
 
 		/* This thread is purely an optimisation. But if it runs when
 		   other things could be running, it actually makes things a
-- 
cgit v1.2.3


From 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 31 Oct 2008 23:28:30 +0000
Subject: saner FASYNC handling on file close

As it is, all instances of ->release() for files that have ->fasync()
need to remember to evict file from fasync lists; forgetting that
creates a hole and we actually have a bunch that *does* forget.

So let's keep our lives simple - let __fput() check FASYNC in
file->f_flags and call ->fasync() there if it's been set.  And lose that
crap in ->release() instances - leaving it there is still valid, but we
don't have to bother anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c   | 4 ++++
 fs/fuse/dev.c     | 1 -
 fs/inotify_user.c | 3 ---
 fs/pipe.c         | 3 ---
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index efc06faede6..5ad0eca6eea 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -269,6 +269,10 @@ void __fput(struct file *file)
 	eventpoll_release(file);
 	locks_remove_flock(file);
 
+	if (unlikely(file->f_flags & FASYNC)) {
+		if (file->f_op && file->f_op->fasync)
+			file->f_op->fasync(-1, file, 0);
+	}
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 87250b6a868..b72361479be 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1056,7 +1056,6 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		spin_unlock(&fc->lock);
-		fasync_helper(-1, file, 0, &fc->fasync);
 		fuse_conn_put(fc);
 	}
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index d85c7d931cd..d367e9b9286 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -537,9 +537,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
 		inotify_dev_event_dequeue(dev);
 	mutex_unlock(&dev->ev_mutex);
 
-	if (file->f_flags & FASYNC)
-		inotify_fasync(-1, file, 0);
-
 	/* free this device: the put matching the get in inotify_init() */
 	put_inotify_dev(dev);
 
diff --git a/fs/pipe.c b/fs/pipe.c
index fcba6542b8d..7aea8b89baa 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -717,14 +717,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 static int
 pipe_read_release(struct inode *inode, struct file *filp)
 {
-	pipe_read_fasync(-1, filp, 0);
 	return pipe_release(inode, 1, 0);
 }
 
 static int
 pipe_write_release(struct inode *inode, struct file *filp)
 {
-	pipe_write_fasync(-1, filp, 0);
 	return pipe_release(inode, 0, 1);
 }
 
@@ -733,7 +731,6 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
 {
 	int decr, decw;
 
-	pipe_rdwr_fasync(-1, filp, 0);
 	decr = (filp->f_mode & FMODE_READ) != 0;
 	decw = (filp->f_mode & FMODE_WRITE) != 0;
 	return pipe_release(inode, decr, decw);
-- 
cgit v1.2.3


From e219cca082f52e7dfea41f3be264b7b5eb204227 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:37:59 -0500
Subject: jbd: don't give up looking for space so easily in
 __log_wait_for_space

Commit be07c4ed introducd a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes the bug reported by Meelis Roos at:
http://bugzilla.kernel.org/show_bug.cgi?id=11937

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd/checkpoint.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 1bd8d4acc6f..61f32f3868c 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__log_space_left(journal) < nblocks) {
+		space_left = __log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				log_do_checkpoint(journal);
+			} else if (cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space\n", __func__);
+				WARN_ON(1);
 				journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 8c3f25d8950c3e9fe6c9849f88679b3f2a071550 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:38:07 -0500
Subject: jbd2: don't give up looking for space so easily in
 __jbd2_log_wait_for_space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 23f8b79e introducd a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling jbd2_cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes a bug reported by Mihai Harpau at:
https://bugzilla.redhat.com/show_bug.cgi?id=469582

This patch fixes a bug reported by François Valenduc at:
http://bugzilla.kernel.org/show_bug.cgi?id=11840

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd2/checkpoint.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9203c3332f1..9497718fe92 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__jbd2_log_space_left(journal) < nblocks) {
+		space_left = __jbd2_log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				jbd2_log_do_checkpoint(journal);
+			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				jbd2_log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space in %s\n", __func__,
+				       journal->j_devname);
+				WARN_ON(1);
 				jbd2_journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 2423840ded13e6d3b52d88aff8d033bb78fafd08 Mon Sep 17 00:00:00 2001
From: Sami Liedes <sliedes@cc.hut.fi>
Date: Sun, 2 Nov 2008 19:23:30 -0500
Subject: jbd2: deregister proc on failure in jbd2_journal_init_inode

jbd2_journal_init_inode() does not call jbd2_stats_proc_exit() on all
failure paths after calling jbd2_stats_proc_init(). This leaves
dangling references to the fs in proc.

This patch fixes a bug reported by Sami Leides at:
http://bugzilla.kernel.org/show_bug.cgi?id=11493

Signed-off-by: Sami Liedes <sliedes@cc.hut.fi>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 783de118de9..e70d657a19f 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
@@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
-- 
cgit v1.2.3


From ae6884a9da56f8921e432e663b4ccb4a1851b2ea Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 3 Nov 2008 14:05:08 -0500
Subject: cifs: fix renaming one hardlink on top of another

cifs: fix renaming one hardlink on top of another

POSIX says that renaming one hardlink on top of another to the same
inode is a no-op. We had the logic mostly right, but forgot to clear
the return code.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d54fa8aeaea..ff8c68de4a9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1361,9 +1361,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (tmprc == 0 && (info_buf_source->UniqueId ==
-				   info_buf_target->UniqueId))
+				   info_buf_target->UniqueId)) {
 			/* same file, POSIX says that this is a noop */
+			rc = 0;
 			goto cifs_rename_exit;
+		}
 	} /* else ... BB we could add the same check for Windows by
 		     checking the UniqueId via FILE_INTERNAL_INFO */
 
-- 
cgit v1.2.3


From ae2d9fb18e575ed37ffc241ece4bf68f0be4ae32 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:10:50 -0500
Subject: ext4: fix missing ext4_unlock_group in error path

If we try to free a block which is already freed, the code was
returning without first unlocking the group.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dfe17a13405..444ad998f72 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
+			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
 			    "Double free of blocks %d (%d %d)\n",
 			    block, entry->start_blk, entry->count);
-- 
cgit v1.2.3


From d94e99a64c3beece22dbfb2b335771a59184eb0a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:11:26 -0500
Subject: ext4: Convert to host order before using the values.

Use le16_to_cpu to read the s_reserved_gdt_blocks values
from super block.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 994859df010..e27acd18b4b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1458,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 
 	/* We allocate both existing and potentially added groups */
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
-			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
-			   groups_per_flex;
+			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-- 
cgit v1.2.3


From 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 3 Nov 2008 18:10:55 -0500
Subject: ext4: wait on all pending commits in ext4_sync_fs()

In ext4_sync_fs, we only wait for a commit to finish if we started it,
but there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which
are delayed due to a long list of other I/O on the backing block
device, this causes the buffer associated with the long symlinks to
not be moved to the inode dirty list in the second phase of
fsync_super.  Then, before they can be dirtied again, kjournald exits,
seeing the UMOUNT flag and the dirty pages are never written to the
backing block device, causing long symlink corruption and exposing new
or previously freed block data to userspace.

To ensure all commits are synced, we flush all journal commits now
when sync_fs'ing ext4.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
---
 fs/ext4/super.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e27acd18b4b..e4a241c65db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2884,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb)
 /*
  * Ext4 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
  */
-
 static void ext4_write_super(struct super_block *sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
@@ -2899,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb)
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
+	int ret = 0;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
-	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
-		if (wait)
-			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
-	}
-	return 0;
+	if (wait)
+		ret = ext4_force_commit(sb);
+	else
+		jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From dc8a0843a435b2c0891e7eaea64faaf1ebec9b11 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Wed, 5 Nov 2008 23:21:16 +0100
Subject: [JFFS2] fix race condition in jffs2_lzo_compress()

deflate_mutex protects the globals lzo_mem and lzo_compress_buf.  However,
jffs2_lzo_compress() unlocks deflate_mutex _before_ it has copied out the
compressed data from lzo_compress_buf.  Correct this by moving the mutex
unlock after the copy.

In addition, document what deflate_mutex actually protects.

Cc: stable@kernel.org
Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Acked-by: Richard Purdie <rpurdie@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr_lzo.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 47b045797e4..90cb60d0978 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -19,7 +19,7 @@
 
 static void *lzo_mem;
 static void *lzo_compress_buf;
-static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(deflate_mutex);	/* for lzo_mem and lzo_compress_buf */
 
 static void free_workspace(void)
 {
@@ -49,18 +49,21 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 
 	mutex_lock(&deflate_mutex);
 	ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem);
-	mutex_unlock(&deflate_mutex);
-
 	if (ret != LZO_E_OK)
-		return -1;
+		goto fail;
 
 	if (compress_size > *dstlen)
-		return -1;
+		goto fail;
 
 	memcpy(cpage_out, lzo_compress_buf, compress_size);
-	*dstlen = compress_size;
+	mutex_unlock(&deflate_mutex);
 
+	*dstlen = compress_size;
 	return 0;
+
+ fail:
+	mutex_unlock(&deflate_mutex);
+	return -1;
 }
 
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-- 
cgit v1.2.3


From 89f97496e81d2112b5e41416fe3020688c443818 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 5 Nov 2008 10:21:06 +0100
Subject: block: fix __blkdev_get() for removable devices

Commit 0762b8bde9729f10f8e6249809660ff2ec3ad735 moved disk_get_part()
in front of recursive get on the whole disk, which caused removable
devices to try disk_get_part() before rescanning after a new media is
inserted, which might fail legit open attempts or give the old
partition.

This patch fixes the problem by moving disk_get_part() after
__blkdev_get() on the whole disk.

This problem was spotted by Borislav Petkov.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88a776fa0ef..db831efbdbb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -986,7 +986,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
-	struct hd_struct *part = NULL;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1004,24 +1003,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		return ret;
 	}
 
-	ret = -ENXIO;
-
 	lock_kernel();
 
+	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out_unlock_kernel;
-	part = disk_get_part(disk, partno);
-	if (!part)
-		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
-		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
+
+			ret = -ENXIO;
+			bdev->bd_part = disk_get_part(disk, partno);
+			if (!bdev->bd_part)
+				goto out_clear;
+
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, mode);
 				if (ret)
@@ -1049,18 +1049,17 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdev->bd_contains = whole;
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
+			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
-			    !part || !part->nr_sects) {
+			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
 				ret = -ENXIO;
 				goto out_clear;
 			}
-			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
+			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		disk_put_part(part);
 		put_disk(disk);
 		module_put(disk->fops->owner);
-		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
@@ -1080,6 +1079,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return 0;
 
  out_clear:
+	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
@@ -1091,7 +1091,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_kernel:
 	unlock_kernel();
 
-	disk_put_part(part);
 	if (disk)
 		module_put(disk->fops->owner);
 	put_disk(disk);
-- 
cgit v1.2.3


From ac51d83705c2a38c71f39cde99708b14e6212a60 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 16:49:36 -0500
Subject: ext4: calculate journal credits correctly

This fixes a 2.6.27 regression which was introduced in commit a02908f1.

We weren't passing the chunk parameter down to the two subections,
ext4_indirect_trans_blocks() and ext4_ext_index_trans_blocks(), with
the result that massively overestimate the amount of credits needed by
ext4_da_writepages, especially in the non-extents case.  This causes
failures especially on /boot partitions, which tend to be small and
non-extent using since GRUB doesn't handle extents.

This patch fixes the bug reported by Joseph Fannin at:
http://bugzilla.kernel.org/show_bug.cgi?id=11964

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbf6953845..5a130b56f1c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4580,9 +4580,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
-		return ext4_indirect_trans_blocks(inode, nrblocks, 0);
-	return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
+
 /*
  * Account for index blocks, block groups bitmaps and block group
  * descriptor blocks if modify datablocks and index blocks
-- 
cgit v1.2.3


From bc9c4068388eea01d3b5da31016879f2341ecec5 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:22 -0800
Subject: autofs4: correct offset mount expire check

When checking a directory tree in autofs_tree_busy() we can incorrectly
decide that the tree isn't busy.  This happens for the case of an active
offset mount as autofs4_follow_mount() follows past the active offset
mount, which has an open file handle used for expires, causing the file
handle not to count toward the busyness check.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/expire.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cde2f8e8935..4b6fb3f628c 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -56,12 +56,23 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	mntget(mnt);
 	dget(dentry);
 
-	if (!autofs4_follow_mount(&mnt, &dentry))
+	if (!follow_down(&mnt, &dentry))
 		goto done;
 
-	/* This is an autofs submount, we can't expire it */
-	if (is_autofs4_dentry(dentry))
-		goto done;
+	if (is_autofs4_dentry(dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+		/* This is an autofs submount, we can't expire it */
+		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+			goto done;
+
+		/*
+		 * Otherwise it's an offset mount and we need to check
+		 * if we can umount its mount, if there is one.
+		 */
+		if (!d_mountpoint(dentry))
+			goto done;
+	}
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(mnt)) {
-- 
cgit v1.2.3


From 96b0317906690997c16c7efffbc4c0fafcd6f7f2 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:23 -0800
Subject: autofs4: collect version check return

The function check_dev_ioctl_version() returns an error code upon fail but
it isn't captured and returned in validate_dev_ioctl() as it should be.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 625abf5422e..33bf8cbfd05 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -128,9 +128,10 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
-	int err = -EINVAL;
+	int err;
 
-	if (check_dev_ioctl_version(cmd, param)) {
+	err = check_dev_ioctl_version(cmd, param);
+	if (err) {
 		AUTOFS_WARN("invalid device control module version "
 		     "supplied for cmd(0x%08x)", cmd);
 		goto out;
-- 
cgit v1.2.3


From c87591b719737b4e91eb1a9fa8fd55a4ff1886d6 Mon Sep 17 00:00:00 2001
From: Arthur Jones <ajones@riverbed.com>
Date: Thu, 6 Nov 2008 12:53:35 -0800
Subject: ext3: wait on all pending commits in ext3_sync_fs

In ext3_sync_fs, we only wait for a commit to finish if we started it, but
there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which are
delayed due to a long list of other I/O on the backing block device, this
causes the buffer associated with the long symlinks to not be moved to the
inode dirty list in the second phase of fsync_super.  Then, before they
can be dirtied again, kjournald exits, seeing the UMOUNT flag and the
dirty pages are never written to the backing block device, causing long
symlink corruption and exposing new or previously freed block data to
userspace.

This can be reproduced with a script created
by Eric Sandeen <sandeen@redhat.com>:

	#!/bin/bash

	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	rm -f /mnt/test2/*
	dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512
	touch
	/mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
	ln -s
	/mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
	/mnt/test2/link
	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	ls /mnt/test2/
	umount /mnt/test2

To ensure all commits are synced, we flush all journal commits now when
sync_fs'ing ext3.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>		[2.6.everything]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e5717a4fae6..5dec6d1356c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2390,13 +2390,12 @@ static void ext3_write_super (struct super_block * sb)
 
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
-
 	sb->s_dirt = 0;
-	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
-		if (wait)
-			log_wait_commit(EXT3_SB(sb)->s_journal, target);
-	}
+	if (wait)
+		ext3_force_commit(sb);
+	else
+		journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 990e194e69009028e029b7d25da68c38241ec4f0 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:45 -0800
Subject: fat: move fs/vfat/* and fs/msdos/* to fs/fat

This just moves those files, but change link order from MSDOS, VFAT to
VFAT, MSDOS.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Makefile          |    2 -
 fs/fat/Makefile      |    6 +-
 fs/fat/namei_msdos.c |  702 +++++++++++++++++++++++++++++++++
 fs/fat/namei_vfat.c  | 1055 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/msdos/Makefile    |    7 -
 fs/msdos/namei.c     |  702 ---------------------------------
 fs/vfat/Makefile     |    7 -
 fs/vfat/namei.c      | 1055 --------------------------------------------------
 8 files changed, 1762 insertions(+), 1774 deletions(-)
 create mode 100644 fs/fat/namei_msdos.c
 create mode 100644 fs/fat/namei_vfat.c
 delete mode 100644 fs/msdos/Makefile
 delete mode 100644 fs/msdos/namei.c
 delete mode 100644 fs/vfat/Makefile
 delete mode 100644 fs/vfat/namei.c

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 2168c902d5c..d9f8afe6f0c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -81,8 +81,6 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
-obj-$(CONFIG_MSDOS_FS)		+= msdos/
-obj-$(CONFIG_VFAT_FS)		+= vfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index bfb5f06cf2c..e06190322c1 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -3,5 +3,9 @@
 #
 
 obj-$(CONFIG_FAT_FS) += fat.o
+obj-$(CONFIG_VFAT_FS) += vfat.o
+obj-$(CONFIG_MSDOS_FS) += msdos.o
 
-fat-objs := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+vfat-y := namei_vfat.o
+msdos-y := namei_msdos.o
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
new file mode 100644
index 00000000000..e844b9809d2
--- /dev/null
+++ b/fs/fat/namei_msdos.c
@@ -0,0 +1,702 @@
+/*
+ *  linux/fs/msdos/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
+ *  Rewritten for constant inumbers 1999 by Al Viro
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/buffer_head.h>
+#include <linux/msdos_fs.h>
+#include <linux/smp_lock.h>
+
+/* Characters that are undesirable in an MS-DOS file name */
+static unsigned char bad_chars[] = "*?<>|\"";
+static unsigned char bad_if_strict[] = "+=,; ";
+
+/***** Formats an MS-DOS file name. Rejects invalid names. */
+static int msdos_format_name(const unsigned char *name, int len,
+			     unsigned char *res, struct fat_mount_options *opts)
+	/*
+	 * name is the proposed name, len is its length, res is
+	 * the resulting name, opts->name_check is either (r)elaxed,
+	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
+	 * beginning of name (for hidden files)
+	 */
+{
+	unsigned char *walk;
+	unsigned char c;
+	int space;
+
+	if (name[0] == '.') {	/* dotfile because . and .. already done */
+		if (opts->dotsOK) {
+			/* Get rid of dot - test for it elsewhere */
+			name++;
+			len--;
+		} else
+			return -EINVAL;
+	}
+	/*
+	 * disallow names that _really_ start with a dot
+	 */
+	space = 1;
+	c = 0;
+	for (walk = res; len && walk - res < 8; walk++) {
+		c = *name++;
+		len--;
+		if (opts->name_check != 'r' && strchr(bad_chars, c))
+			return -EINVAL;
+		if (opts->name_check == 's' && strchr(bad_if_strict, c))
+			return -EINVAL;
+		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+			return -EINVAL;
+		if (c < ' ' || c == ':' || c == '\\')
+			return -EINVAL;
+	/*
+	 * 0xE5 is legal as a first character, but we must substitute
+	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
+	 * does this.
+	 * It seems that Microsoft hacked DOS to support non-US
+	 * characters after the 0xE5 character was already in use to
+	 * mark deleted files.
+	 */
+		if ((res == walk) && (c == 0xE5))
+			c = 0x05;
+		if (c == '.')
+			break;
+		space = (c == ' ');
+		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
+	}
+	if (space)
+		return -EINVAL;
+	if (opts->name_check == 's' && len && c != '.') {
+		c = *name++;
+		len--;
+		if (c != '.')
+			return -EINVAL;
+	}
+	while (c != '.' && len--)
+		c = *name++;
+	if (c == '.') {
+		while (walk - res < 8)
+			*walk++ = ' ';
+		while (len > 0 && walk - res < MSDOS_NAME) {
+			c = *name++;
+			len--;
+			if (opts->name_check != 'r' && strchr(bad_chars, c))
+				return -EINVAL;
+			if (opts->name_check == 's' &&
+			    strchr(bad_if_strict, c))
+				return -EINVAL;
+			if (c < ' ' || c == ':' || c == '\\')
+				return -EINVAL;
+			if (c == '.') {
+				if (opts->name_check == 's')
+					return -EINVAL;
+				break;
+			}
+			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+				return -EINVAL;
+			space = c == ' ';
+			if (!opts->nocase && c >= 'a' && c <= 'z')
+				*walk++ = c - 32;
+			else
+				*walk++ = c;
+		}
+		if (space)
+			return -EINVAL;
+		if (opts->name_check == 's' && len)
+			return -EINVAL;
+	}
+	while (walk - res < MSDOS_NAME)
+		*walk++ = ' ';
+
+	return 0;
+}
+
+/***** Locates a directory entry.  Uses unformatted name. */
+static int msdos_find(struct inode *dir, const unsigned char *name, int len,
+		      struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	unsigned char msdos_name[MSDOS_NAME];
+	int err;
+
+	err = msdos_format_name(name, len, msdos_name, &sbi->options);
+	if (err)
+		return -ENOENT;
+
+	err = fat_scan(dir, msdos_name, sinfo);
+	if (!err && sbi->options.dotsOK) {
+		if (name[0] == '.') {
+			if (!(sinfo->de->attr & ATTR_HIDDEN))
+				err = -ENOENT;
+		} else {
+			if (sinfo->de->attr & ATTR_HIDDEN)
+				err = -ENOENT;
+		}
+		if (err)
+			brelse(sinfo->bh);
+	}
+	return err;
+}
+
+/*
+ * Compute the hash for the msdos name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The msdos fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
+	if (!error)
+		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+	return 0;
+}
+
+/*
+ * Compare two msdos names. If either of the names are invalid,
+ * we fall back to doing the standard name comparison.
+ */
+static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
+out:
+	return error;
+
+old_compare:
+	error = 1;
+	if (a->len == b->len)
+		error = memcmp(a->name, b->name, a->len);
+	goto out;
+}
+
+static struct dentry_operations msdos_dentry_operations = {
+	.d_hash		= msdos_hash,
+	.d_compare	= msdos_cmp,
+};
+
+/*
+ * AV. Wrappers for FAT sb operations. Is it wise?
+ */
+
+/***** Get inode using directory and name */
+static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	int res;
+
+	dentry->d_op = &msdos_dentry_operations;
+
+	lock_super(sb);
+	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (res == -ENOENT)
+		goto add;
+	if (res < 0)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		res = PTR_ERR(inode);
+		goto out;
+	}
+add:
+	res = 0;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry)
+		dentry->d_op = &msdos_dentry_operations;
+out:
+	unlock_super(sb);
+	if (!res)
+		return dentry;
+	return ERR_PTR(res);
+}
+
+/***** Creates a directory entry (name is already formatted). */
+static int msdos_add_entry(struct inode *dir, const unsigned char *name,
+			   int is_dir, int is_hid, int cluster,
+			   struct timespec *ts, struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct msdos_dir_entry de;
+	__le16 time, date;
+	int err;
+
+	memcpy(de.name, name, MSDOS_NAME);
+	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	if (is_hid)
+		de.attr |= ATTR_HIDDEN;
+	de.lcase = 0;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de.cdate = de.adate = 0;
+	de.ctime = 0;
+	de.ctime_cs = 0;
+	de.time = time;
+	de.date = date;
+	de.start = cpu_to_le16(cluster);
+	de.starthi = cpu_to_le16(cluster >> 16);
+	de.size = 0;
+
+	err = fat_add_entries(dir, &de, 1, sinfo);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	return 0;
+}
+
+/***** Create a file */
+static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = NULL;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	unsigned char msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* Have to do it due to foo vs. .foo conflicts */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+	return err;
+}
+
+/***** Remove a directory */
+static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = dentry->d_inode;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	/*
+	 * Check whether the directory is not in use, then check
+	 * whether it is empty.
+	 */
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+/***** Make a directory */
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	unsigned char msdos_name[MSDOS_NAME];
+	struct timespec ts;
+	int err, is_hid, cluster;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* foo vs .foo situation */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	fat_flush_inodes(sb, dir, inode);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+/***** Unlink a file */
+static int msdos_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb= inode->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
+			   struct dentry *old_dentry,
+			   struct inode *new_dir, unsigned char *new_name,
+			   struct dentry *new_dentry, int is_hid)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+
+	err = fat_scan(old_dir, old_name, &old_sinfo);
+	if (err) {
+		err = -EIO;
+		goto out;
+	}
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	old_attrs = MSDOS_I(old_inode)->i_attrs;
+	err = fat_scan(new_dir, new_name, &sinfo);
+	if (!err) {
+		if (!new_inode) {
+			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
+			if (sinfo.de != old_sinfo.de) {
+				err = -EINVAL;
+				goto out;
+			}
+			if (is_hid)
+				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+			else
+				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+			if (IS_DIRSYNC(old_dir)) {
+				err = fat_sync_inode(old_inode);
+				if (err) {
+					MSDOS_I(old_inode)->i_attrs = old_attrs;
+					goto out;
+				}
+			} else
+				mark_inode_dirty(old_inode);
+
+			old_dir->i_version++;
+			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+			if (IS_DIRSYNC(old_dir))
+				(void)fat_sync_inode(old_dir);
+			else
+				mark_inode_dirty(old_dir);
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (err)
+			goto out;
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
+				      &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (is_hid)
+		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+	else
+		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	MSDOS_I(old_inode)->i_attrs = old_attrs;
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
+static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(old_dentry->d_name.name,
+				old_dentry->d_name.len, old_msdos_name,
+				&MSDOS_SB(old_dir->i_sb)->options);
+	if (err)
+		goto out;
+	err = msdos_format_name(new_dentry->d_name.name,
+				new_dentry->d_name.len, new_msdos_name,
+				&MSDOS_SB(new_dir->i_sb)->options);
+	if (err)
+		goto out;
+
+	is_hid =
+	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
+
+	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
+			      new_dir, new_msdos_name, new_dentry, is_hid);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, old_dir, new_dir);
+	return err;
+}
+
+static const struct inode_operations msdos_dir_inode_operations = {
+	.create		= msdos_create,
+	.lookup		= msdos_lookup,
+	.unlink		= msdos_unlink,
+	.mkdir		= msdos_mkdir,
+	.rmdir		= msdos_rmdir,
+	.rename		= msdos_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int res;
+
+	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
+	if (res)
+		return res;
+
+	sb->s_flags |= MS_NOATIME;
+	sb->s_root->d_op = &msdos_dentry_operations;
+	return 0;
+}
+
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
+}
+
+static struct file_system_type msdos_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "msdos",
+	.get_sb		= msdos_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_msdos_fs(void)
+{
+	return register_filesystem(&msdos_fs_type);
+}
+
+static void __exit exit_msdos_fs(void)
+{
+	unregister_filesystem(&msdos_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("MS-DOS filesystem support");
+
+module_init(init_msdos_fs)
+module_exit(exit_msdos_fs)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
new file mode 100644
index 00000000000..155c10b4adb
--- /dev/null
+++ b/fs/fat/namei_vfat.c
@@ -0,0 +1,1055 @@
+/*
+ *  linux/fs/vfat/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Windows95/Windows NT compatible extended MSDOS filesystem
+ *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
+ *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
+ *    what file operation caused you trouble and if you can duplicate
+ *    the problem, send a script that demonstrates it.
+ *
+ *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
+ *
+ *  Support Multibyte characters and cleanup by
+ *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+ */
+
+#include <linux/module.h>
+
+#include <linux/jiffies.h>
+#include <linux/msdos_fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	int ret = 1;
+
+	if (!dentry->d_inode &&
+	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
+		/*
+		 * negative dentry is dropped, in order to make sure
+		 * to use the name which a user desires if this is
+		 * create path.
+		 */
+		ret = 0;
+	else {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
+			ret = 0;
+		spin_unlock(&dentry->d_lock);
+	}
+	return ret;
+}
+
+/* returns the length of a struct qstr, ignoring trailing dots */
+static unsigned int vfat_striptail_len(struct qstr *qstr)
+{
+	unsigned int len = qstr->len;
+
+	while (len && qstr->name[len - 1] == '.')
+		len--;
+	return len;
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
+	return 0;
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	const unsigned char *name;
+	unsigned int len;
+	unsigned long hash;
+
+	name = qstr->name;
+	len = vfat_striptail_len(qstr);
+
+	hash = init_name_hash();
+	while (len--)
+		hash = partial_name_hash(nls_tolower(t, *name++), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(a);
+	blen = vfat_striptail_len(b);
+	if (alen == blen) {
+		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Case sensitive compare of two vfat names.
+ */
+static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(a);
+	blen = vfat_striptail_len(b);
+	if (alen == blen) {
+		if (strncmp(a->name, b->name, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+static struct dentry_operations vfat_dentry_ops[4] = {
+	{
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	}
+};
+
+/* Characters that are undesirable in an MS-DOS file name */
+
+static inline wchar_t vfat_bad_char(wchar_t w)
+{
+	return (w < 0x0020)
+	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
+	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
+	    || (w == '\\');
+}
+
+static inline wchar_t vfat_replace_char(wchar_t w)
+{
+	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
+	    || (w == '+') || (w == '=');
+}
+
+static wchar_t vfat_skip_char(wchar_t w)
+{
+	return (w == '.') || (w == ' ');
+}
+
+static inline int vfat_is_used_badchars(const wchar_t *s, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (vfat_bad_char(s[i]))
+			return -EINVAL;
+
+	if (s[i - 1] == ' ') /* last character cannot be space */
+		return -EINVAL;
+
+	return 0;
+}
+
+static int vfat_find_form(struct inode *dir, unsigned char *name)
+{
+	struct fat_slot_info sinfo;
+	int err = fat_scan(dir, name, &sinfo);
+	if (err)
+		return -ENOENT;
+	brelse(sinfo.bh);
+	return 0;
+}
+
+/*
+ * 1) Valid characters for the 8.3 format alias are any combination of
+ * letters, uppercase alphabets, digits, any of the
+ * following special characters:
+ *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
+ * In this case Longfilename is not stored in disk.
+ *
+ * WinNT's Extension:
+ * File name and extension name is contain uppercase/lowercase
+ * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
+ *
+ * 2) File name is 8.3 format, but it contain the uppercase and
+ * lowercase char, muliti bytes char, etc. In this case numtail is not
+ * added, but Longfilename is stored.
+ *
+ * 3) When the one except for the above, or the following special
+ * character are contained:
+ *        .   [ ] ; , + =
+ * numtail is added, and Longfilename must be stored in disk .
+ */
+struct shortname_info {
+	unsigned char lower:1,
+		      upper:1,
+		      valid:1;
+};
+#define INIT_SHORTNAME_INFO(x)	do {		\
+	(x)->lower = 1;				\
+	(x)->upper = 1;				\
+	(x)->valid = 1;				\
+} while (0)
+
+static inline int to_shortname_char(struct nls_table *nls,
+				    unsigned char *buf, int buf_size,
+				    wchar_t *src, struct shortname_info *info)
+{
+	int len;
+
+	if (vfat_skip_char(*src)) {
+		info->valid = 0;
+		return 0;
+	}
+	if (vfat_replace_char(*src)) {
+		info->valid = 0;
+		buf[0] = '_';
+		return 1;
+	}
+
+	len = nls->uni2char(*src, buf, buf_size);
+	if (len <= 0) {
+		info->valid = 0;
+		buf[0] = '_';
+		len = 1;
+	} else if (len == 1) {
+		unsigned char prev = buf[0];
+
+		if (buf[0] >= 0x7F) {
+			info->lower = 0;
+			info->upper = 0;
+		}
+
+		buf[0] = nls_toupper(nls, buf[0]);
+		if (isalpha(buf[0])) {
+			if (buf[0] == prev)
+				info->lower = 0;
+			else
+				info->upper = 0;
+		}
+	} else {
+		info->lower = 0;
+		info->upper = 0;
+	}
+
+	return len;
+}
+
+/*
+ * Given a valid longname, create a unique shortname.  Make sure the
+ * shortname does not exist
+ * Returns negative number on error, 0 for a normal
+ * return, and 1 for valid shortname
+ */
+static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
+				 wchar_t *uname, int ulen,
+				 unsigned char *name_res, unsigned char *lcase)
+{
+	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
+	wchar_t *ip, *ext_start, *end, *name_start;
+	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
+	int chl, chi;
+	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
+	int is_shortname;
+	struct shortname_info base_info, ext_info;
+
+	is_shortname = 1;
+	INIT_SHORTNAME_INFO(&base_info);
+	INIT_SHORTNAME_INFO(&ext_info);
+
+	/* Now, we need to create a shortname from the long name */
+	ext_start = end = &uname[ulen];
+	while (--ext_start >= uname) {
+		if (*ext_start == 0x002E) {	/* is `.' */
+			if (ext_start == end - 1) {
+				sz = ulen;
+				ext_start = NULL;
+			}
+			break;
+		}
+	}
+
+	if (ext_start == uname - 1) {
+		sz = ulen;
+		ext_start = NULL;
+	} else if (ext_start) {
+		/*
+		 * Names which start with a dot could be just
+		 * an extension eg. "...test".  In this case Win95
+		 * uses the extension as the name and sets no extension.
+		 */
+		name_start = &uname[0];
+		while (name_start < ext_start) {
+			if (!vfat_skip_char(*name_start))
+				break;
+			name_start++;
+		}
+		if (name_start != ext_start) {
+			sz = ext_start - uname;
+			ext_start++;
+		} else {
+			sz = ulen;
+			ext_start = NULL;
+		}
+	}
+
+	numtail_baselen = 6;
+	numtail2_baselen = 2;
+	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
+		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+					ip, &base_info);
+		if (chl == 0)
+			continue;
+
+		if (baselen < 2 && (baselen + chl) > 2)
+			numtail2_baselen = baselen;
+		if (baselen < 6 && (baselen + chl) > 6)
+			numtail_baselen = baselen;
+		for (chi = 0; chi < chl; chi++) {
+			*p++ = charbuf[chi];
+			baselen++;
+			if (baselen >= 8)
+				break;
+		}
+		if (baselen >= 8) {
+			if ((chi < chl - 1) || (ip + 1) - uname < sz)
+				is_shortname = 0;
+			break;
+		}
+	}
+	if (baselen == 0) {
+		return -EINVAL;
+	}
+
+	extlen = 0;
+	if (ext_start) {
+		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
+			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+						ip, &ext_info);
+			if (chl == 0)
+				continue;
+
+			if ((extlen + chl) > 3) {
+				is_shortname = 0;
+				break;
+			}
+			for (chi = 0; chi < chl; chi++) {
+				*p++ = charbuf[chi];
+				extlen++;
+			}
+			if (extlen >= 3) {
+				if (ip + 1 != end)
+					is_shortname = 0;
+				break;
+			}
+		}
+	}
+	ext[extlen] = '\0';
+	base[baselen] = '\0';
+
+	/* Yes, it can happen. ".\xe5" would do it. */
+	if (base[0] == DELETED_FLAG)
+		base[0] = 0x05;
+
+	/* OK, at this point we know that base is not longer than 8 symbols,
+	 * ext is not longer than 3, base is nonempty, both don't contain
+	 * any bad symbols (lowercase transformed to uppercase).
+	 */
+
+	memset(name_res, ' ', MSDOS_NAME);
+	memcpy(name_res, base, baselen);
+	memcpy(name_res + 8, ext, extlen);
+	*lcase = 0;
+	if (is_shortname && base_info.valid && ext_info.valid) {
+		if (vfat_find_form(dir, name_res) == 0)
+			return -EEXIST;
+
+		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
+			return (base_info.upper && ext_info.upper);
+		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
+			if ((base_info.upper || base_info.lower) &&
+			    (ext_info.upper || ext_info.lower)) {
+				if (!base_info.upper && base_info.lower)
+					*lcase |= CASE_LOWER_BASE;
+				if (!ext_info.upper && ext_info.lower)
+					*lcase |= CASE_LOWER_EXT;
+				return 1;
+			}
+			return 0;
+		} else {
+			BUG();
+		}
+	}
+
+	if (opts->numtail == 0)
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+
+	/*
+	 * Try to find a unique extension.  This used to
+	 * iterate through all possibilities sequentially,
+	 * but that gave extremely bad performance.  Windows
+	 * only tries a few cases before using random
+	 * values for part of the base.
+	 */
+
+	if (baselen > 6) {
+		baselen = numtail_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen] = '~';
+	for (i = 1; i < 10; i++) {
+		name_res[baselen + 1] = i + '0';
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+	}
+
+	i = jiffies & 0xffff;
+	sz = (jiffies >> 16) & 0x7;
+	if (baselen > 2) {
+		baselen = numtail2_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen + 4] = '~';
+	name_res[baselen + 5] = '1' + sz;
+	while (1) {
+		sprintf(buf, "%04X", i);
+		memcpy(&name_res[baselen], buf, 4);
+		if (vfat_find_form(dir, name_res) < 0)
+			break;
+		i -= 11;
+	}
+	return 0;
+}
+
+/* Translate a string, including coded sequences into Unicode */
+static int
+xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
+	     int *longlen, int *outlen, int escape, int utf8,
+	     struct nls_table *nls)
+{
+	const unsigned char *ip;
+	unsigned char nc;
+	unsigned char *op;
+	unsigned int ec;
+	int i, k, fill;
+	int charlen;
+
+	if (utf8) {
+		int name_len = strlen(name);
+
+		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+
+		/*
+		 * We stripped '.'s before and set len appropriately,
+		 * but utf8_mbstowcs doesn't care about len
+		 */
+		*outlen -= (name_len - len);
+
+		if (*outlen > 255)
+			return -ENAMETOOLONG;
+
+		op = &outname[*outlen * sizeof(wchar_t)];
+	} else {
+		if (nls) {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     *outlen += 1)
+			{
+				if (escape && (*ip == ':')) {
+					if (i > len - 5)
+						return -EINVAL;
+					ec = 0;
+					for (k = 1; k < 5; k++) {
+						nc = ip[k];
+						ec <<= 4;
+						if (nc >= '0' && nc <= '9') {
+							ec |= nc - '0';
+							continue;
+						}
+						if (nc >= 'a' && nc <= 'f') {
+							ec |= nc - ('a' - 10);
+							continue;
+						}
+						if (nc >= 'A' && nc <= 'F') {
+							ec |= nc - ('A' - 10);
+							continue;
+						}
+						return -EINVAL;
+					}
+					*op++ = ec & 0xFF;
+					*op++ = ec >> 8;
+					ip += 5;
+					i += 5;
+				} else {
+					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
+						return -EINVAL;
+					ip += charlen;
+					i += charlen;
+					op += 2;
+				}
+			}
+			if (i < len)
+				return -ENAMETOOLONG;
+		} else {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     i++, *outlen += 1)
+			{
+				*op++ = *ip++;
+				*op++ = 0;
+			}
+			if (i < len)
+				return -ENAMETOOLONG;
+		}
+	}
+
+	*longlen = *outlen;
+	if (*outlen % 13) {
+		*op++ = 0;
+		*op++ = 0;
+		*outlen += 1;
+		if (*outlen % 13) {
+			fill = 13 - (*outlen % 13);
+			for (i = 0; i < fill; i++) {
+				*op++ = 0xff;
+				*op++ = 0xff;
+			}
+			*outlen += fill;
+		}
+	}
+
+	return 0;
+}
+
+static int vfat_build_slots(struct inode *dir, const unsigned char *name,
+			    int len, int is_dir, int cluster,
+			    struct timespec *ts,
+			    struct msdos_dir_slot *slots, int *nr_slots)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct fat_mount_options *opts = &sbi->options;
+	struct msdos_dir_slot *ps;
+	struct msdos_dir_entry *de;
+	unsigned char cksum, lcase;
+	unsigned char msdos_name[MSDOS_NAME];
+	wchar_t *uname;
+	__le16 time, date;
+	int err, ulen, usize, i;
+	loff_t offset;
+
+	*nr_slots = 0;
+
+	uname = __getname();
+	if (!uname)
+		return -ENOMEM;
+
+	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
+			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
+	if (err)
+		goto out_free;
+
+	err = vfat_is_used_badchars(uname, ulen);
+	if (err)
+		goto out_free;
+
+	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
+				    msdos_name, &lcase);
+	if (err < 0)
+		goto out_free;
+	else if (err == 1) {
+		de = (struct msdos_dir_entry *)slots;
+		err = 0;
+		goto shortname;
+	}
+
+	/* build the entry of long file name */
+	cksum = fat_checksum(msdos_name);
+
+	*nr_slots = usize / 13;
+	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
+		ps->id = i;
+		ps->attr = ATTR_EXT;
+		ps->reserved = 0;
+		ps->alias_checksum = cksum;
+		ps->start = 0;
+		offset = (i - 1) * 13;
+		fatwchar_to16(ps->name0_4, uname + offset, 5);
+		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
+		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
+	}
+	slots[0].id |= 0x40;
+	de = (struct msdos_dir_entry *)ps;
+
+shortname:
+	/* build the entry of 8.3 alias name */
+	(*nr_slots)++;
+	memcpy(de->name, msdos_name, MSDOS_NAME);
+	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	de->lcase = lcase;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de->time = de->ctime = time;
+	de->date = de->cdate = de->adate = date;
+	de->ctime_cs = 0;
+	de->start = cpu_to_le16(cluster);
+	de->starthi = cpu_to_le16(cluster >> 16);
+	de->size = 0;
+out_free:
+	__putname(uname);
+	return err;
+}
+
+static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
+			  int cluster, struct timespec *ts,
+			  struct fat_slot_info *sinfo)
+{
+	struct msdos_dir_slot *slots;
+	unsigned int len;
+	int err, nr_slots;
+
+	len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+
+	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
+	if (slots == NULL)
+		return -ENOMEM;
+
+	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
+			       slots, &nr_slots);
+	if (err)
+		goto cleanup;
+
+	err = fat_add_entries(dir, slots, nr_slots, sinfo);
+	if (err)
+		goto cleanup;
+
+	/* update timestamp */
+	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+cleanup:
+	kfree(slots);
+	return err;
+}
+
+static int vfat_find(struct inode *dir, struct qstr *qname,
+		     struct fat_slot_info *sinfo)
+{
+	unsigned int len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+	return fat_search_long(dir, qname->name, len, sinfo);
+}
+
+static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	struct dentry *alias;
+	int err, table;
+
+	lock_super(sb);
+	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
+	dentry->d_op = &vfat_dentry_ops[table];
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err) {
+		table++;
+		goto error;
+	}
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		unlock_super(sb);
+		return ERR_CAST(inode);
+	}
+	alias = d_find_alias(inode);
+	if (alias) {
+		if (d_invalidate(alias) == 0)
+			dput(alias);
+		else {
+			iput(inode);
+			unlock_super(sb);
+			return alias;
+		}
+
+	}
+error:
+	unlock_super(sb);
+	dentry->d_op = &vfat_dentry_ops[table];
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry) {
+		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+	}
+	return dentry;
+}
+
+static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	dir->i_version++;
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+static int vfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err, cluster;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	dir->i_version++;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, is_dir, update_dotdot, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+	lock_super(sb);
+	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
+	if (err)
+		goto out;
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
+				     &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+ 			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	unlock_super(sb);
+
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+static const struct inode_operations vfat_dir_inode_operations = {
+	.create		= vfat_create,
+	.lookup		= vfat_lookup,
+	.unlink		= vfat_unlink,
+	.mkdir		= vfat_mkdir,
+	.rmdir		= vfat_rmdir,
+	.rename		= vfat_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int res;
+
+	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
+	if (res)
+		return res;
+
+	if (MSDOS_SB(sb)->options.name_check != 's')
+		sb->s_root->d_op = &vfat_dentry_ops[0];
+	else
+		sb->s_root->d_op = &vfat_dentry_ops[2];
+
+	return 0;
+}
+
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
+}
+
+static struct file_system_type vfat_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "vfat",
+	.get_sb		= vfat_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_vfat_fs(void)
+{
+	return register_filesystem(&vfat_fs_type);
+}
+
+static void __exit exit_vfat_fs(void)
+{
+	unregister_filesystem(&vfat_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("VFAT filesystem support");
+MODULE_AUTHOR("Gordon Chaffee");
+
+module_init(init_vfat_fs)
+module_exit(exit_vfat_fs)
diff --git a/fs/msdos/Makefile b/fs/msdos/Makefile
deleted file mode 100644
index ea67646fcb9..00000000000
--- a/fs/msdos/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the Linux msdos filesystem routines.
-#
-
-obj-$(CONFIG_MSDOS_FS) += msdos.o
-
-msdos-y := namei.o
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
deleted file mode 100644
index e844b9809d2..00000000000
--- a/fs/msdos/namei.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  linux/fs/msdos/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
- *  Rewritten for constant inumbers 1999 by Al Viro
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
-
-/* Characters that are undesirable in an MS-DOS file name */
-static unsigned char bad_chars[] = "*?<>|\"";
-static unsigned char bad_if_strict[] = "+=,; ";
-
-/***** Formats an MS-DOS file name. Rejects invalid names. */
-static int msdos_format_name(const unsigned char *name, int len,
-			     unsigned char *res, struct fat_mount_options *opts)
-	/*
-	 * name is the proposed name, len is its length, res is
-	 * the resulting name, opts->name_check is either (r)elaxed,
-	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
-	 * beginning of name (for hidden files)
-	 */
-{
-	unsigned char *walk;
-	unsigned char c;
-	int space;
-
-	if (name[0] == '.') {	/* dotfile because . and .. already done */
-		if (opts->dotsOK) {
-			/* Get rid of dot - test for it elsewhere */
-			name++;
-			len--;
-		} else
-			return -EINVAL;
-	}
-	/*
-	 * disallow names that _really_ start with a dot
-	 */
-	space = 1;
-	c = 0;
-	for (walk = res; len && walk - res < 8; walk++) {
-		c = *name++;
-		len--;
-		if (opts->name_check != 'r' && strchr(bad_chars, c))
-			return -EINVAL;
-		if (opts->name_check == 's' && strchr(bad_if_strict, c))
-			return -EINVAL;
-		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-			return -EINVAL;
-		if (c < ' ' || c == ':' || c == '\\')
-			return -EINVAL;
-	/*
-	 * 0xE5 is legal as a first character, but we must substitute
-	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
-	 * does this.
-	 * It seems that Microsoft hacked DOS to support non-US
-	 * characters after the 0xE5 character was already in use to
-	 * mark deleted files.
-	 */
-		if ((res == walk) && (c == 0xE5))
-			c = 0x05;
-		if (c == '.')
-			break;
-		space = (c == ' ');
-		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
-	}
-	if (space)
-		return -EINVAL;
-	if (opts->name_check == 's' && len && c != '.') {
-		c = *name++;
-		len--;
-		if (c != '.')
-			return -EINVAL;
-	}
-	while (c != '.' && len--)
-		c = *name++;
-	if (c == '.') {
-		while (walk - res < 8)
-			*walk++ = ' ';
-		while (len > 0 && walk - res < MSDOS_NAME) {
-			c = *name++;
-			len--;
-			if (opts->name_check != 'r' && strchr(bad_chars, c))
-				return -EINVAL;
-			if (opts->name_check == 's' &&
-			    strchr(bad_if_strict, c))
-				return -EINVAL;
-			if (c < ' ' || c == ':' || c == '\\')
-				return -EINVAL;
-			if (c == '.') {
-				if (opts->name_check == 's')
-					return -EINVAL;
-				break;
-			}
-			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-				return -EINVAL;
-			space = c == ' ';
-			if (!opts->nocase && c >= 'a' && c <= 'z')
-				*walk++ = c - 32;
-			else
-				*walk++ = c;
-		}
-		if (space)
-			return -EINVAL;
-		if (opts->name_check == 's' && len)
-			return -EINVAL;
-	}
-	while (walk - res < MSDOS_NAME)
-		*walk++ = ' ';
-
-	return 0;
-}
-
-/***** Locates a directory entry.  Uses unformatted name. */
-static int msdos_find(struct inode *dir, const unsigned char *name, int len,
-		      struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	unsigned char msdos_name[MSDOS_NAME];
-	int err;
-
-	err = msdos_format_name(name, len, msdos_name, &sbi->options);
-	if (err)
-		return -ENOENT;
-
-	err = fat_scan(dir, msdos_name, sinfo);
-	if (!err && sbi->options.dotsOK) {
-		if (name[0] == '.') {
-			if (!(sinfo->de->attr & ATTR_HIDDEN))
-				err = -ENOENT;
-		} else {
-			if (sinfo->de->attr & ATTR_HIDDEN)
-				err = -ENOENT;
-		}
-		if (err)
-			brelse(sinfo->bh);
-	}
-	return err;
-}
-
-/*
- * Compute the hash for the msdos name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The msdos fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
-	if (!error)
-		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
-	return 0;
-}
-
-/*
- * Compare two msdos names. If either of the names are invalid,
- * we fall back to doing the standard name comparison.
- */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
-out:
-	return error;
-
-old_compare:
-	error = 1;
-	if (a->len == b->len)
-		error = memcmp(a->name, b->name, a->len);
-	goto out;
-}
-
-static struct dentry_operations msdos_dentry_operations = {
-	.d_hash		= msdos_hash,
-	.d_compare	= msdos_cmp,
-};
-
-/*
- * AV. Wrappers for FAT sb operations. Is it wise?
- */
-
-/***** Get inode using directory and name */
-static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
-
-	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
-	}
-add:
-	res = 0;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry)
-		dentry->d_op = &msdos_dentry_operations;
-out:
-	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
-}
-
-/***** Creates a directory entry (name is already formatted). */
-static int msdos_add_entry(struct inode *dir, const unsigned char *name,
-			   int is_dir, int is_hid, int cluster,
-			   struct timespec *ts, struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct msdos_dir_entry de;
-	__le16 time, date;
-	int err;
-
-	memcpy(de.name, name, MSDOS_NAME);
-	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	if (is_hid)
-		de.attr |= ATTR_HIDDEN;
-	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de.cdate = de.adate = 0;
-	de.ctime = 0;
-	de.ctime_cs = 0;
-	de.time = time;
-	de.date = date;
-	de.start = cpu_to_le16(cluster);
-	de.starthi = cpu_to_le16(cluster >> 16);
-	de.size = 0;
-
-	err = fat_add_entries(dir, &de, 1, sinfo);
-	if (err)
-		return err;
-
-	dir->i_ctime = dir->i_mtime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-
-	return 0;
-}
-
-/***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
-			struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = NULL;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	unsigned char msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* Have to do it due to foo vs. .foo conflicts */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-	return err;
-}
-
-/***** Remove a directory */
-static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = dentry->d_inode;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	/*
-	 * Check whether the directory is not in use, then check
-	 * whether it is empty.
-	 */
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-/***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode;
-	unsigned char msdos_name[MSDOS_NAME];
-	struct timespec ts;
-	int err, is_hid, cluster;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* foo vs .foo situation */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	fat_flush_inodes(sb, dir, inode);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-/***** Unlink a file */
-static int msdos_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb= inode->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
-			   struct dentry *old_dentry,
-			   struct inode *new_dir, unsigned char *new_name,
-			   struct dentry *new_dentry, int is_hid)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-
-	err = fat_scan(old_dir, old_name, &old_sinfo);
-	if (err) {
-		err = -EIO;
-		goto out;
-	}
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	old_attrs = MSDOS_I(old_inode)->i_attrs;
-	err = fat_scan(new_dir, new_name, &sinfo);
-	if (!err) {
-		if (!new_inode) {
-			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
-			if (sinfo.de != old_sinfo.de) {
-				err = -EINVAL;
-				goto out;
-			}
-			if (is_hid)
-				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-			else
-				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-			if (IS_DIRSYNC(old_dir)) {
-				err = fat_sync_inode(old_inode);
-				if (err) {
-					MSDOS_I(old_inode)->i_attrs = old_attrs;
-					goto out;
-				}
-			} else
-				mark_inode_dirty(old_inode);
-
-			old_dir->i_version++;
-			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-			if (IS_DIRSYNC(old_dir))
-				(void)fat_sync_inode(old_dir);
-			else
-				mark_inode_dirty(old_dir);
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (err)
-			goto out;
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
-				      &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (is_hid)
-		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-	else
-		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
-			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	MSDOS_I(old_inode)->i_attrs = old_attrs;
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
-static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct super_block *sb = old_dir->i_sb;
-	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(old_dentry->d_name.name,
-				old_dentry->d_name.len, old_msdos_name,
-				&MSDOS_SB(old_dir->i_sb)->options);
-	if (err)
-		goto out;
-	err = msdos_format_name(new_dentry->d_name.name,
-				new_dentry->d_name.len, new_msdos_name,
-				&MSDOS_SB(new_dir->i_sb)->options);
-	if (err)
-		goto out;
-
-	is_hid =
-	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
-
-	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
-			      new_dir, new_msdos_name, new_dentry, is_hid);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, old_dir, new_dir);
-	return err;
-}
-
-static const struct inode_operations msdos_dir_inode_operations = {
-	.create		= msdos_create,
-	.lookup		= msdos_lookup,
-	.unlink		= msdos_unlink,
-	.mkdir		= msdos_mkdir,
-	.rmdir		= msdos_rmdir,
-	.rename		= msdos_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-	if (res)
-		return res;
-
-	sb->s_flags |= MS_NOATIME;
-	sb->s_root->d_op = &msdos_dentry_operations;
-	return 0;
-}
-
-static int msdos_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
-			   mnt);
-}
-
-static struct file_system_type msdos_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "msdos",
-	.get_sb		= msdos_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_msdos_fs(void)
-{
-	return register_filesystem(&msdos_fs_type);
-}
-
-static void __exit exit_msdos_fs(void)
-{
-	unregister_filesystem(&msdos_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Werner Almesberger");
-MODULE_DESCRIPTION("MS-DOS filesystem support");
-
-module_init(init_msdos_fs)
-module_exit(exit_msdos_fs)
diff --git a/fs/vfat/Makefile b/fs/vfat/Makefile
deleted file mode 100644
index 40f2798a4f0..00000000000
--- a/fs/vfat/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux vfat-filesystem routines.
-#
-
-obj-$(CONFIG_VFAT_FS) += vfat.o
-
-vfat-y := namei.o
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
deleted file mode 100644
index 155c10b4adb..00000000000
--- a/fs/vfat/namei.c
+++ /dev/null
@@ -1,1055 +0,0 @@
-/*
- *  linux/fs/vfat/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *
- *  Windows95/Windows NT compatible extended MSDOS filesystem
- *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
- *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
- *    what file operation caused you trouble and if you can duplicate
- *    the problem, send a script that demonstrates it.
- *
- *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
- *
- *  Support Multibyte characters and cleanup by
- *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
- */
-
-#include <linux/module.h>
-
-#include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
-#include <linux/ctype.h>
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/namei.h>
-
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
-		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
-	return ret;
-}
-
-/* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
-{
-	unsigned int len = qstr->len;
-
-	while (len && qstr->name[len - 1] == '.')
-		len--;
-	return len;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
-	return 0;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	const unsigned char *name;
-	unsigned int len;
-	unsigned long hash;
-
-	name = qstr->name;
-	len = vfat_striptail_len(qstr);
-
-	hash = init_name_hash();
-	while (len--)
-		hash = partial_name_hash(nls_tolower(t, *name++), hash);
-	qstr->hash = end_name_hash(hash);
-
-	return 0;
-}
-
-/*
- * Case insensitive compare of two vfat names.
- */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-/*
- * Case sensitive compare of two vfat names.
- */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (strncmp(a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
-};
-
-/* Characters that are undesirable in an MS-DOS file name */
-
-static inline wchar_t vfat_bad_char(wchar_t w)
-{
-	return (w < 0x0020)
-	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
-	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
-	    || (w == '\\');
-}
-
-static inline wchar_t vfat_replace_char(wchar_t w)
-{
-	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
-	    || (w == '+') || (w == '=');
-}
-
-static wchar_t vfat_skip_char(wchar_t w)
-{
-	return (w == '.') || (w == ' ');
-}
-
-static inline int vfat_is_used_badchars(const wchar_t *s, int len)
-{
-	int i;
-
-	for (i = 0; i < len; i++)
-		if (vfat_bad_char(s[i]))
-			return -EINVAL;
-
-	if (s[i - 1] == ' ') /* last character cannot be space */
-		return -EINVAL;
-
-	return 0;
-}
-
-static int vfat_find_form(struct inode *dir, unsigned char *name)
-{
-	struct fat_slot_info sinfo;
-	int err = fat_scan(dir, name, &sinfo);
-	if (err)
-		return -ENOENT;
-	brelse(sinfo.bh);
-	return 0;
-}
-
-/*
- * 1) Valid characters for the 8.3 format alias are any combination of
- * letters, uppercase alphabets, digits, any of the
- * following special characters:
- *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
- * In this case Longfilename is not stored in disk.
- *
- * WinNT's Extension:
- * File name and extension name is contain uppercase/lowercase
- * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
- *
- * 2) File name is 8.3 format, but it contain the uppercase and
- * lowercase char, muliti bytes char, etc. In this case numtail is not
- * added, but Longfilename is stored.
- *
- * 3) When the one except for the above, or the following special
- * character are contained:
- *        .   [ ] ; , + =
- * numtail is added, and Longfilename must be stored in disk .
- */
-struct shortname_info {
-	unsigned char lower:1,
-		      upper:1,
-		      valid:1;
-};
-#define INIT_SHORTNAME_INFO(x)	do {		\
-	(x)->lower = 1;				\
-	(x)->upper = 1;				\
-	(x)->valid = 1;				\
-} while (0)
-
-static inline int to_shortname_char(struct nls_table *nls,
-				    unsigned char *buf, int buf_size,
-				    wchar_t *src, struct shortname_info *info)
-{
-	int len;
-
-	if (vfat_skip_char(*src)) {
-		info->valid = 0;
-		return 0;
-	}
-	if (vfat_replace_char(*src)) {
-		info->valid = 0;
-		buf[0] = '_';
-		return 1;
-	}
-
-	len = nls->uni2char(*src, buf, buf_size);
-	if (len <= 0) {
-		info->valid = 0;
-		buf[0] = '_';
-		len = 1;
-	} else if (len == 1) {
-		unsigned char prev = buf[0];
-
-		if (buf[0] >= 0x7F) {
-			info->lower = 0;
-			info->upper = 0;
-		}
-
-		buf[0] = nls_toupper(nls, buf[0]);
-		if (isalpha(buf[0])) {
-			if (buf[0] == prev)
-				info->lower = 0;
-			else
-				info->upper = 0;
-		}
-	} else {
-		info->lower = 0;
-		info->upper = 0;
-	}
-
-	return len;
-}
-
-/*
- * Given a valid longname, create a unique shortname.  Make sure the
- * shortname does not exist
- * Returns negative number on error, 0 for a normal
- * return, and 1 for valid shortname
- */
-static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
-				 wchar_t *uname, int ulen,
-				 unsigned char *name_res, unsigned char *lcase)
-{
-	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
-	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
-	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
-	int chl, chi;
-	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
-	int is_shortname;
-	struct shortname_info base_info, ext_info;
-
-	is_shortname = 1;
-	INIT_SHORTNAME_INFO(&base_info);
-	INIT_SHORTNAME_INFO(&ext_info);
-
-	/* Now, we need to create a shortname from the long name */
-	ext_start = end = &uname[ulen];
-	while (--ext_start >= uname) {
-		if (*ext_start == 0x002E) {	/* is `.' */
-			if (ext_start == end - 1) {
-				sz = ulen;
-				ext_start = NULL;
-			}
-			break;
-		}
-	}
-
-	if (ext_start == uname - 1) {
-		sz = ulen;
-		ext_start = NULL;
-	} else if (ext_start) {
-		/*
-		 * Names which start with a dot could be just
-		 * an extension eg. "...test".  In this case Win95
-		 * uses the extension as the name and sets no extension.
-		 */
-		name_start = &uname[0];
-		while (name_start < ext_start) {
-			if (!vfat_skip_char(*name_start))
-				break;
-			name_start++;
-		}
-		if (name_start != ext_start) {
-			sz = ext_start - uname;
-			ext_start++;
-		} else {
-			sz = ulen;
-			ext_start = NULL;
-		}
-	}
-
-	numtail_baselen = 6;
-	numtail2_baselen = 2;
-	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
-		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-					ip, &base_info);
-		if (chl == 0)
-			continue;
-
-		if (baselen < 2 && (baselen + chl) > 2)
-			numtail2_baselen = baselen;
-		if (baselen < 6 && (baselen + chl) > 6)
-			numtail_baselen = baselen;
-		for (chi = 0; chi < chl; chi++) {
-			*p++ = charbuf[chi];
-			baselen++;
-			if (baselen >= 8)
-				break;
-		}
-		if (baselen >= 8) {
-			if ((chi < chl - 1) || (ip + 1) - uname < sz)
-				is_shortname = 0;
-			break;
-		}
-	}
-	if (baselen == 0) {
-		return -EINVAL;
-	}
-
-	extlen = 0;
-	if (ext_start) {
-		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
-			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-						ip, &ext_info);
-			if (chl == 0)
-				continue;
-
-			if ((extlen + chl) > 3) {
-				is_shortname = 0;
-				break;
-			}
-			for (chi = 0; chi < chl; chi++) {
-				*p++ = charbuf[chi];
-				extlen++;
-			}
-			if (extlen >= 3) {
-				if (ip + 1 != end)
-					is_shortname = 0;
-				break;
-			}
-		}
-	}
-	ext[extlen] = '\0';
-	base[baselen] = '\0';
-
-	/* Yes, it can happen. ".\xe5" would do it. */
-	if (base[0] == DELETED_FLAG)
-		base[0] = 0x05;
-
-	/* OK, at this point we know that base is not longer than 8 symbols,
-	 * ext is not longer than 3, base is nonempty, both don't contain
-	 * any bad symbols (lowercase transformed to uppercase).
-	 */
-
-	memset(name_res, ' ', MSDOS_NAME);
-	memcpy(name_res, base, baselen);
-	memcpy(name_res + 8, ext, extlen);
-	*lcase = 0;
-	if (is_shortname && base_info.valid && ext_info.valid) {
-		if (vfat_find_form(dir, name_res) == 0)
-			return -EEXIST;
-
-		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
-			return (base_info.upper && ext_info.upper);
-		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
-			if ((base_info.upper || base_info.lower) &&
-			    (ext_info.upper || ext_info.lower)) {
-				if (!base_info.upper && base_info.lower)
-					*lcase |= CASE_LOWER_BASE;
-				if (!ext_info.upper && ext_info.lower)
-					*lcase |= CASE_LOWER_EXT;
-				return 1;
-			}
-			return 0;
-		} else {
-			BUG();
-		}
-	}
-
-	if (opts->numtail == 0)
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-
-	/*
-	 * Try to find a unique extension.  This used to
-	 * iterate through all possibilities sequentially,
-	 * but that gave extremely bad performance.  Windows
-	 * only tries a few cases before using random
-	 * values for part of the base.
-	 */
-
-	if (baselen > 6) {
-		baselen = numtail_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen] = '~';
-	for (i = 1; i < 10; i++) {
-		name_res[baselen + 1] = i + '0';
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-	}
-
-	i = jiffies & 0xffff;
-	sz = (jiffies >> 16) & 0x7;
-	if (baselen > 2) {
-		baselen = numtail2_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen + 4] = '~';
-	name_res[baselen + 5] = '1' + sz;
-	while (1) {
-		sprintf(buf, "%04X", i);
-		memcpy(&name_res[baselen], buf, 4);
-		if (vfat_find_form(dir, name_res) < 0)
-			break;
-		i -= 11;
-	}
-	return 0;
-}
-
-/* Translate a string, including coded sequences into Unicode */
-static int
-xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
-	     int *longlen, int *outlen, int escape, int utf8,
-	     struct nls_table *nls)
-{
-	const unsigned char *ip;
-	unsigned char nc;
-	unsigned char *op;
-	unsigned int ec;
-	int i, k, fill;
-	int charlen;
-
-	if (utf8) {
-		int name_len = strlen(name);
-
-		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
-
-		/*
-		 * We stripped '.'s before and set len appropriately,
-		 * but utf8_mbstowcs doesn't care about len
-		 */
-		*outlen -= (name_len - len);
-
-		if (*outlen > 255)
-			return -ENAMETOOLONG;
-
-		op = &outname[*outlen * sizeof(wchar_t)];
-	} else {
-		if (nls) {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     *outlen += 1)
-			{
-				if (escape && (*ip == ':')) {
-					if (i > len - 5)
-						return -EINVAL;
-					ec = 0;
-					for (k = 1; k < 5; k++) {
-						nc = ip[k];
-						ec <<= 4;
-						if (nc >= '0' && nc <= '9') {
-							ec |= nc - '0';
-							continue;
-						}
-						if (nc >= 'a' && nc <= 'f') {
-							ec |= nc - ('a' - 10);
-							continue;
-						}
-						if (nc >= 'A' && nc <= 'F') {
-							ec |= nc - ('A' - 10);
-							continue;
-						}
-						return -EINVAL;
-					}
-					*op++ = ec & 0xFF;
-					*op++ = ec >> 8;
-					ip += 5;
-					i += 5;
-				} else {
-					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
-						return -EINVAL;
-					ip += charlen;
-					i += charlen;
-					op += 2;
-				}
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		} else {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     i++, *outlen += 1)
-			{
-				*op++ = *ip++;
-				*op++ = 0;
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		}
-	}
-
-	*longlen = *outlen;
-	if (*outlen % 13) {
-		*op++ = 0;
-		*op++ = 0;
-		*outlen += 1;
-		if (*outlen % 13) {
-			fill = 13 - (*outlen % 13);
-			for (i = 0; i < fill; i++) {
-				*op++ = 0xff;
-				*op++ = 0xff;
-			}
-			*outlen += fill;
-		}
-	}
-
-	return 0;
-}
-
-static int vfat_build_slots(struct inode *dir, const unsigned char *name,
-			    int len, int is_dir, int cluster,
-			    struct timespec *ts,
-			    struct msdos_dir_slot *slots, int *nr_slots)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct fat_mount_options *opts = &sbi->options;
-	struct msdos_dir_slot *ps;
-	struct msdos_dir_entry *de;
-	unsigned char cksum, lcase;
-	unsigned char msdos_name[MSDOS_NAME];
-	wchar_t *uname;
-	__le16 time, date;
-	int err, ulen, usize, i;
-	loff_t offset;
-
-	*nr_slots = 0;
-
-	uname = __getname();
-	if (!uname)
-		return -ENOMEM;
-
-	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
-			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
-	if (err)
-		goto out_free;
-
-	err = vfat_is_used_badchars(uname, ulen);
-	if (err)
-		goto out_free;
-
-	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
-				    msdos_name, &lcase);
-	if (err < 0)
-		goto out_free;
-	else if (err == 1) {
-		de = (struct msdos_dir_entry *)slots;
-		err = 0;
-		goto shortname;
-	}
-
-	/* build the entry of long file name */
-	cksum = fat_checksum(msdos_name);
-
-	*nr_slots = usize / 13;
-	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
-		ps->id = i;
-		ps->attr = ATTR_EXT;
-		ps->reserved = 0;
-		ps->alias_checksum = cksum;
-		ps->start = 0;
-		offset = (i - 1) * 13;
-		fatwchar_to16(ps->name0_4, uname + offset, 5);
-		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
-		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
-	}
-	slots[0].id |= 0x40;
-	de = (struct msdos_dir_entry *)ps;
-
-shortname:
-	/* build the entry of 8.3 alias name */
-	(*nr_slots)++;
-	memcpy(de->name, msdos_name, MSDOS_NAME);
-	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de->time = de->ctime = time;
-	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
-	de->start = cpu_to_le16(cluster);
-	de->starthi = cpu_to_le16(cluster >> 16);
-	de->size = 0;
-out_free:
-	__putname(uname);
-	return err;
-}
-
-static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
-			  int cluster, struct timespec *ts,
-			  struct fat_slot_info *sinfo)
-{
-	struct msdos_dir_slot *slots;
-	unsigned int len;
-	int err, nr_slots;
-
-	len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-
-	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
-	if (slots == NULL)
-		return -ENOMEM;
-
-	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
-			       slots, &nr_slots);
-	if (err)
-		goto cleanup;
-
-	err = fat_add_entries(dir, slots, nr_slots, sinfo);
-	if (err)
-		goto cleanup;
-
-	/* update timestamp */
-	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-cleanup:
-	kfree(slots);
-	return err;
-}
-
-static int vfat_find(struct inode *dir, struct qstr *qname,
-		     struct fat_slot_info *sinfo)
-{
-	unsigned int len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-	return fat_search_long(dir, qname->name, len, sinfo);
-}
-
-static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
-				  struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	struct dentry *alias;
-	int err, table;
-
-	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err) {
-		table++;
-		goto error;
-	}
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
-	}
-	alias = d_find_alias(inode);
-	if (alias) {
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
-
-	}
-error:
-	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
-		dentry->d_time = dentry->d_parent->d_inode->i_version;
-	}
-	return dentry;
-}
-
-static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-		       struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	dir->i_version++;
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err, cluster;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	dir->i_version++;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, is_dir, update_dotdot, corrupt = 0;
-	struct super_block *sb = old_dir->i_sb;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-	lock_super(sb);
-	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
-	if (err)
-		goto out;
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
-				     &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
- 			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	unlock_super(sb);
-
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-static const struct inode_operations vfat_dir_inode_operations = {
-	.create		= vfat_create,
-	.lookup		= vfat_lookup,
-	.unlink		= vfat_unlink,
-	.mkdir		= vfat_mkdir,
-	.rmdir		= vfat_rmdir,
-	.rename		= vfat_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-	if (res)
-		return res;
-
-	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
-	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
-
-	return 0;
-}
-
-static int vfat_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name,
-		       void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
-			   mnt);
-}
-
-static struct file_system_type vfat_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "vfat",
-	.get_sb		= vfat_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_vfat_fs(void)
-{
-	return register_filesystem(&vfat_fs_type);
-}
-
-static void __exit exit_vfat_fs(void)
-{
-	unregister_filesystem(&vfat_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("VFAT filesystem support");
-MODULE_AUTHOR("Gordon Chaffee");
-
-module_init(init_vfat_fs)
-module_exit(exit_vfat_fs)
-- 
cgit v1.2.3


From 9e975dae2970d22557662761c8505ce9fd165684 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:46 -0800
Subject: fat: split include/msdos_fs.h

This splits __KERNEL__ stuff in include/msdos_fs.h into fs/fat/fat.h.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c       |   2 +-
 fs/fat/dir.c         |   2 +-
 fs/fat/fat.h         | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/fatent.c      |   1 +
 fs/fat/file.c        |   2 +-
 fs/fat/inode.c       |   2 +-
 fs/fat/misc.c        |   2 +-
 fs/fat/namei_msdos.c |   2 +-
 fs/fat/namei_vfat.c  |   3 +-
 9 files changed, 282 insertions(+), 8 deletions(-)
 create mode 100644 fs/fat/fat.h

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3222f51c41c..589edde9053 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /* this must be > 0. */
 #define FAT_MAX_CACHE	8
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bae1c329252..08b23ad25f1 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,11 +16,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include "fat.h"
 
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
new file mode 100644
index 00000000000..51f1c42ca5e
--- /dev/null
+++ b/fs/fat/fat.h
@@ -0,0 +1,274 @@
+#ifndef _FAT_H
+#define _FAT_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/msdos_fs.h>
+
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
+
+struct fat_mount_options {
+	uid_t fs_uid;
+	gid_t fs_gid;
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short codepage;  /* Codepage for shortname conversions */
+	char *iocharset;          /* Charset used for filename input/display */
+	unsigned short shortname; /* flags for shortname display/create rule */
+	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+	unsigned short allow_utime;/* permission for setting the [am]time */
+	unsigned quiet:1,         /* set = fake successful chmods and chowns */
+		 showexec:1,      /* set = only set x bit for com/exe/bat */
+		 sys_immutable:1, /* set = system files are immutable */
+		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
+		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
+		 utf8:1,	  /* Use of UTF-8 character set (Default) */
+		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
+		 flush:1,	  /* write things quickly */
+		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
+		 usefree:1,	  /* Use free_clusters for FAT32 */
+		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+};
+
+#define FAT_HASH_BITS	8
+#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
+#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
+
+/*
+ * MS-DOS file system in-core superblock data
+ */
+struct msdos_sb_info {
+	unsigned short sec_per_clus; /* sectors/cluster */
+	unsigned short cluster_bits; /* log2(cluster_size) */
+	unsigned int cluster_size;   /* cluster size */
+	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+	unsigned short fat_start;
+	unsigned long fat_length;    /* FAT start & length (sec.) */
+	unsigned long dir_start;
+	unsigned short dir_entries;  /* root dir start & entries */
+	unsigned long data_start;    /* first data sector */
+	unsigned long max_cluster;   /* maximum cluster number */
+	unsigned long root_cluster;  /* first cluster of the root directory */
+	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+	struct mutex fat_lock;
+	unsigned int prev_free;      /* previously allocated cluster number */
+	unsigned int free_clusters;  /* -1 if undefined */
+	unsigned int free_clus_valid; /* is free_clusters valid? */
+	struct fat_mount_options options;
+	struct nls_table *nls_disk;  /* Codepage used on disk */
+	struct nls_table *nls_io;    /* Charset used for input and display */
+	const void *dir_ops;		     /* Opaque; default directory operations */
+	int dir_per_block;	     /* dir entries per block */
+	int dir_per_block_bits;	     /* log2(dir_per_block) */
+
+	int fatent_shift;
+	struct fatent_operations *fatent_ops;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+};
+
+#define FAT_CACHE_VALID	0	/* special case for valid cache */
+
+/*
+ * MS-DOS file system inode data in memory
+ */
+struct msdos_inode_info {
+	spinlock_t cache_lru_lock;
+	struct list_head cache_lru;
+	int nr_caches;
+	/* for avoiding the race between fat_free() and fat_get_cluster() */
+	unsigned int cache_valid_id;
+
+	loff_t mmu_private;
+	int i_start;		/* first cluster or 0 */
+	int i_logstart;		/* logical first cluster */
+	int i_attrs;		/* unused attribute bits */
+	loff_t i_pos;		/* on-disk position of directory entry or 0 */
+	struct hlist_node i_fat_hash;	/* hash by i_location */
+	struct inode vfs_inode;
+};
+
+struct fat_slot_info {
+	loff_t i_pos;		/* on-disk position of directory entry */
+	loff_t slot_off;	/* offset for slot or de start */
+	int nr_slots;		/* number of slots + 1(de) in filename */
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+};
+
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{
+	return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u8 fat_attr(struct inode *inode)
+{
+	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
+		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
+		MSDOS_I(inode)->i_attrs;
+}
+
+static inline unsigned char fat_checksum(const __u8 *name)
+{
+	unsigned char s = name[0];
+	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
+	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
+	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
+	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
+	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
+	return s;
+}
+
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{
+	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
+		+ sbi->data_start;
+}
+
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		*dst++ = src[0] | (src[1] << 8);
+		src += 2;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		dst[0] = *src & 0x00FF;
+		dst[1] = (*src & 0xFF00) >> 8;
+		dst += 2;
+		src++;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+			   int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
+
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+			   int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+		    struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+				struct msdos_dir_entry **de, loff_t *i_pos);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+			   struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+
+/* fat/fatent.c */
+struct fat_entry {
+	int entry;
+	union {
+		u8 *ent12_p[2];
+		__le16 *ent16_p;
+		__le32 *ent32_p;
+	} u;
+	int nr_bhs;
+	struct buffer_head *bhs[2];
+};
+
+static inline void fatent_init(struct fat_entry *fatent)
+{
+	fatent->nr_bhs = 0;
+	fatent->entry = 0;
+	fatent->u.ent32_p = NULL;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{
+	fatent->entry = entry;
+	fatent->u.ent32_p = NULL;
+}
+
+static inline void fatent_brelse(struct fat_entry *fatent)
+{
+	int i;
+	fatent->u.ent32_p = NULL;
+	for (i = 0; i < fatent->nr_bhs; i++)
+		brelse(fatent->bhs[i]);
+	fatent->nr_bhs = 0;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+			int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+			 int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+			      int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+
+/* fat/file.c */
+extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+			     unsigned int cmd, unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern void fat_truncate(struct inode *inode);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+
+/* fat/inode.c */
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+			const struct inode_operations *fs_dir_inode_ops, int isvfat);
+
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+		            struct inode *i2);
+/* fat/misc.c */
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
+extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
+			      int tz_utc);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
+#endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index fb98b3d847e..5b5f49061b7 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
 #include <linux/blkdev.h>
+#include "fat.h"
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index ddde37025ca..b21973f266a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,13 +10,13 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include "fat.h"
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b2eec1283b..3921de2013a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
-#include <linux/msdos_fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
@@ -28,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/log2.h>
 #include <asm/unaligned.h>
+#include "fat.h"
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
 /* if user don't select VFAT, this is undefined. */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 79fb98ad36d..91ad9be18ff 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -8,8 +8,8 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /*
  * fat_fs_panic reports a severe file system problem and sets the file system
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e844b9809d2..c0a4d5cd99b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 155c10b4adb..facf3bf0211 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,14 +16,13 @@
  */
 
 #include <linux/module.h>
-
 #include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include "fat.h"
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-- 
cgit v1.2.3


From 7decd1cb0305b97243f283fa7f4baf5fe613edeb Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: Fix and cleanup timestamp conversion

This cleans date_dos2unix()/fat_date_unix2dos() up. New code should be
much more readable.

And this fixes those old functions. Those doesn't handle 2100
correctly. 2100 isn't leap year, but old one handles it as leap year.
Also, with this, centi sec is handled and is fixed.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c         |   6 ++-
 fs/fat/fat.h         |   7 +--
 fs/fat/inode.c       |  34 ++++--------
 fs/fat/misc.c        | 148 +++++++++++++++++++++++++++++++++++++--------------
 fs/fat/namei_msdos.c |   2 +-
 fs/fat/namei_vfat.c  |   5 +-
 6 files changed, 130 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 08b23ad25f1..a601c6d45bc 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1089,6 +1089,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	struct msdos_dir_entry *de;
 	sector_t blknr;
 	__le16 date, time;
+	u8 time_cs;
 	int err, cluster;
 
 	err = fat_alloc_clusters(dir, &cluster, 1);
@@ -1102,7 +1103,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 		goto error_free;
 	}
 
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
 	/* filling the new directory slots ("." and ".." entries) */
@@ -1112,13 +1113,14 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	de[0].lcase = de[1].lcase = 0;
 	de[0].time = de[1].time = time;
 	de[0].date = de[1].date = date;
-	de[0].ctime_cs = de[1].ctime_cs = 0;
 	if (sbi->options.isvfat) {
 		/* extra timestamps */
 		de[0].ctime = de[1].ctime = time;
+		de[0].ctime_cs = de[1].ctime_cs = time_cs;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
 	} else {
 		de[0].ctime = de[1].ctime = 0;
+		de[0].ctime_cs = de[1].ctime_cs = 0;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
 	}
 	de[0].start = cpu_to_le16(cluster);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 51f1c42ca5e..a2a570f8171 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -263,9 +263,10 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
-extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
-			      int tz_utc);
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 __time, __le16 __date, u8 time_cs);
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 *time, __le16 *date, u8 *time_cs);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
 int fat_cache_init(void);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3921de2013a..079d9d5e0d3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -381,22 +381,12 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
-	inode->i_mtime.tv_sec =
-		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
-			      sbi->options.tz_utc);
-	inode->i_mtime.tv_nsec = 0;
+
+	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
 	if (sbi->options.isvfat) {
-		int secs = de->ctime_cs / 100;
-		int csecs = de->ctime_cs % 100;
-		inode->i_ctime.tv_sec  =
-			date_dos2unix(le16_to_cpu(de->ctime),
-				      le16_to_cpu(de->cdate),
-				      sbi->options.tz_utc) + secs;
-		inode->i_ctime.tv_nsec = csecs * 10000000;
-		inode->i_atime.tv_sec =
-			date_dos2unix(0, le16_to_cpu(de->adate),
-				      sbi->options.tz_utc);
-		inode->i_atime.tv_nsec = 0;
+		fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
+				  de->cdate, de->ctime_cs);
+		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
 
@@ -591,16 +581,14 @@ retry:
 	raw_entry->attr = fat_attr(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
-	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
-			  &raw_entry->date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+			  &raw_entry->date, NULL);
 	if (sbi->options.isvfat) {
 		__le16 atime;
-		fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
-				  &raw_entry->cdate, sbi->options.tz_utc);
-		fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
-				  &raw_entry->adate, sbi->options.tz_utc);
-		raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
-			inode->i_ctime.tv_nsec / 10000000;
+		fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
+				  &raw_entry->cdate, &raw_entry->ctime_cs);
+		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
+				  &raw_entry->adate, NULL);
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 91ad9be18ff..a191e79e66a 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -135,65 +135,131 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 
 extern struct timezone sys_tz;
 
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits :     value
+ * date:  0 -  4: day	(1 -  31)
+ * date:  5 -  8: month	(1 -  12)
+ * date:  9 - 15: year	(0 - 127) from 1980
+ * time:  0 -  4: sec	(0 -  29) 2sec counts
+ * time:  5 - 10: min	(0 -  59)
+ * time: 11 - 15: hour	(0 -  23)
+ */
+#define SECS_PER_MIN	60
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+#define UNIX_SECS_1980	315532800L
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108	4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define YEAR_2100	120
+#define IS_LEAP_YEAR(y)	(!((y) & 3) && (y) != YEAR_2100)
+
 /* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] = {
-   /* Jan  Feb  Mar  Apr   May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
-	0,  31,  59,  90,  120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0
+static time_t days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
 
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 __time, __le16 __date, u8 time_cs)
 {
-	int month, year, secs;
+	u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
+	time_t second, day, leap_day, month, year;
 
-	/*
-	 * first subtract and mask after that... Otherwise, if
-	 * date == 0, bad things happen
-	 */
-	month = ((date >> 5) - 1) & 15;
-	year = date >> 9;
-	secs = (time & 31)*2+60*((time >> 5) & 63)+(time >> 11)*3600+86400*
-	    ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
-	    month < 2 ? 1 : 0)+3653);
-			/* days since 1.1.70 plus 80's leap day */
-	if (!tz_utc)
-		secs += sys_tz.tz_minuteswest*60;
-	return secs;
+	year  = date >> 9;
+	month = max(1, (date >> 5) & 0xf);
+	day   = max(1, date & 0x1f) - 1;
+
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (IS_LEAP_YEAR(year) && month > 2)
+		leap_day++;
+
+	second =  (time & 0x1f) << 1;
+	second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
+	second += (time >> 11) * SECS_PER_HOUR;
+	second += (year * 365 + leap_day
+		   + days_in_year[month] + day
+		   + DAYS_DELTA) * SECS_PER_DAY;
+
+	if (!sbi->options.tz_utc)
+		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	if (time_cs) {
+		ts->tv_sec = second + (time_cs / 100);
+		ts->tv_nsec = (time_cs % 100) * 10000000;
+	} else {
+		ts->tv_sec = second;
+		ts->tv_nsec = 0;
+	}
 }
 
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
+/* Convert linear UNIX date to a FAT time/date pair. */
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 *time, __le16 *date, u8 *time_cs)
 {
-	int day, year, nl_day, month;
+	time_t second = ts->tv_sec;
+	time_t day, leap_day, month, year;
 
-	if (!tz_utc)
-		unix_date -= sys_tz.tz_minuteswest*60;
+	if (!sbi->options.tz_utc)
+		second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
 
 	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-	if (unix_date < 315532800)
-		unix_date = 315532800;
-
-	*time = cpu_to_le16((unix_date % 60)/2+(((unix_date/60) % 60) << 5)+
-	    (((unix_date/3600) % 24) << 11));
-	day = unix_date/86400-3652;
-	year = day/365;
-	if ((year+3)/4+365*year > day)
+	if (second < UNIX_SECS_1980) {
+		*time = 0;
+		*date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
+		if (time_cs)
+			*time_cs = 0;
+		return;
+	}
+#if BITS_PER_LONG == 64
+	if (second >= UNIX_SECS_2108) {
+		*time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
+		*date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
+		if (time_cs)
+			*time_cs = 199;
+		return;
+	}
+#endif
+
+	day = second / SECS_PER_DAY - DAYS_DELTA;
+	year = day / 365;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (year * 365 + leap_day > day)
 		year--;
-	day -= (year+3)/4+365*year;
-	if (day == 59 && !(year & 3)) {
-		nl_day = day;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	day -= year * 365 + leap_day;
+
+	if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
 		month = 2;
 	} else {
-		nl_day = (year & 3) || day <= 59 ? day : day-1;
-		for (month = 0; month < 12; month++) {
-			if (day_n[month] > nl_day)
+		if (IS_LEAP_YEAR(year) && day > days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (days_in_year[month + 1] > day)
 				break;
 		}
 	}
-	*date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9));
-}
+	day -= days_in_year[month];
 
-EXPORT_SYMBOL_GPL(fat_date_unix2dos);
+	*time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
+			    | ((second / SECS_PER_MIN) % 60) << 5
+			    | (second % SECS_PER_MIN) >> 1);
+	*date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
+	if (time_cs)
+		*time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
+}
+EXPORT_SYMBOL_GPL(fat_time_unix2fat);
 
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c0a4d5cd99b..e92e8158eba 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -247,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 	if (is_hid)
 		de.attr |= ATTR_HIDDEN;
 	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, NULL);
 	de.cdate = de.adate = 0;
 	de.ctime = 0;
 	de.ctime_cs = 0;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index facf3bf0211..1536bc3ca0f 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -568,6 +568,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
 	unsigned char msdos_name[MSDOS_NAME];
 	wchar_t *uname;
 	__le16 time, date;
+	u8 time_cs;
 	int err, ulen, usize, i;
 	loff_t offset;
 
@@ -620,10 +621,10 @@ shortname:
 	memcpy(de->name, msdos_name, MSDOS_NAME);
 	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
 	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 	de->time = de->ctime = time;
 	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
+	de->ctime_cs = time_cs;
 	de->start = cpu_to_le16(cluster);
 	de->starthi = cpu_to_le16(cluster >> 16);
 	de->size = 0;
-- 
cgit v1.2.3


From 53472bc8f810d2fb507593ea03703670506a668d Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: use generic_file_llseek() for directory

Since fat_dir_ioctl() was already fixed (i.e. called under ->i_mutex),
and __fat_readdir() doesn't take BKL anymore. So, BKL for ->llseek()
is pointless, and we have to use generic_file_llseek().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index a601c6d45bc..931dd28b528 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -832,6 +832,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 #endif /* CONFIG_COMPAT */
 
 const struct file_operations fat_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
 	.ioctl		= fat_dir_ioctl,
-- 
cgit v1.2.3


From 52e9d9f4b32a3bec91feb76c84e37b7dcffe5040 Mon Sep 17 00:00:00 2001
From: Darren Jenkins <darrenrjenkins@gmail.com>
Date: Thu, 6 Nov 2008 12:53:48 -0800
Subject: fat: cleanup fat_parse_long() error handling

Coverity CID 2332 & 2333 RESOURCE_LEAK

In fat_search_long() if fat_parse_long() returns a -ve value we return
without first freeing unicode.  This patch free's them on this error path.

The above was false positive on current tree, but this change is more
clean, so apply as cleanup.

[hirofumi@mail.parknet.co.jp: fix coding style]
Signed-off-by: Darren Jenkins <darrenrjenkins@gmail.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 931dd28b528..140fc39e230 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -373,9 +373,10 @@ parse_record:
 		if (de->attr == ATTR_EXT) {
 			int status = fat_parse_long(inode, &cpos, &bh, &de,
 						    &unicode, &nr_slots);
-			if (status < 0)
-				return status;
-			else if (status == PARSE_INVALID)
+			if (status < 0) {
+				err = status;
+				goto end_of_dir;
+			} else if (status == PARSE_INVALID)
 				continue;
 			else if (status == PARSE_NOT_LONGNAME)
 				goto parse_record;
-- 
cgit v1.2.3


From d3dfa8228f87ab9960ab8b4718013d68e3c25a43 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: improve fat_hash()

fat_hash() is using the algorithm known as bad. Instead of it, this
uses hash_32(). The following is the summary of test.

old hash:
	hash func (1000 times): 33489 cycles
	total inodes in hash table: 70926
	largest bucket contains: 696
	smallest bucket contains: 54

new hash:
	hash func (1000 times): 33129 cycles
	total inodes in hash table: 70926
	largest bucket contains: 315
	smallest bucket contains: 236

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   |  1 -
 fs/fat/inode.c | 18 +++++++-----------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a2a570f8171..2b8e94c3eef 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -43,7 +43,6 @@ struct fat_mount_options {
 
 #define FAT_HASH_BITS	8
 #define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
-#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
 
 /*
  * MS-DOS file system in-core superblock data
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 079d9d5e0d3..f58cd48d98b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
 #include <linux/uio.h>
 #include <linux/writeback.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
@@ -247,25 +248,21 @@ static void fat_hash_init(struct super_block *sb)
 		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
 }
 
-static inline unsigned long fat_hash(struct super_block *sb, loff_t i_pos)
+static inline unsigned long fat_hash(loff_t i_pos)
 {
-	unsigned long tmp = (unsigned long)i_pos | (unsigned long) sb;
-	tmp = tmp + (tmp >> FAT_HASH_BITS) + (tmp >> FAT_HASH_BITS * 2);
-	return tmp & FAT_HASH_MASK;
+	return hash_32(i_pos, FAT_HASH_BITS);
 }
 
 void fat_attach(struct inode *inode, loff_t i_pos)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 
 	spin_lock(&sbi->inode_hash_lock);
 	MSDOS_I(inode)->i_pos = i_pos;
-	hlist_add_head(&MSDOS_I(inode)->i_fat_hash,
-			sbi->inode_hashtable + fat_hash(sb, i_pos));
+	hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_attach);
 
 void fat_detach(struct inode *inode)
@@ -276,13 +273,12 @@ void fat_detach(struct inode *inode)
 	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_detach);
 
 struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct hlist_head *head = sbi->inode_hashtable + fat_hash(sb, i_pos);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 	struct hlist_node *_p;
 	struct msdos_inode_info *i;
 	struct inode *inode = NULL;
-- 
cgit v1.2.3


From 5e35dd4651002207948f10c576fc7d9bad448815 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: Fix fat_ent_update_ptr() for FAT12

This fixes the missing update for bhs/nr_bhs in case the caller
accessed from block boundary to first block of boundary.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fatent.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 5b5f49061b7..13513992da3 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -317,10 +317,20 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
 	/* Is this fatent's blocks including this entry? */
 	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
 		return 0;
-	/* Does this entry need the next block? */
-	if (sbi->fat_bits == 12 && (offset + 1) >= sb->s_blocksize) {
-		if (fatent->nr_bhs != 2 || bhs[1]->b_blocknr != (blocknr + 1))
-			return 0;
+	if (sbi->fat_bits == 12) {
+		if ((offset + 1) < sb->s_blocksize) {
+			/* This entry is on bhs[0]. */
+			if (fatent->nr_bhs == 2) {
+				brelse(bhs[1]);
+				fatent->nr_bhs = 1;
+			}
+		} else {
+			/* This entry needs the next block. */
+			if (fatent->nr_bhs != 2)
+				return 0;
+			if (bhs[1]->b_blocknr != (blocknr + 1))
+				return 0;
+		}
 	}
 	ops->ent_set_ptr(fatent, offset);
 	return 1;
-- 
cgit v1.2.3


From a993b542bb4cd3e5a64863b7ef892bbebec2239b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:50 -0800
Subject: fat: use fat_detach() in fat_clear_inode()

Use fat_detach() instead of opencoding it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index f58cd48d98b..8e1b75c63c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -429,13 +429,8 @@ static void fat_delete_inode(struct inode *inode)
 
 static void fat_clear_inode(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-
-	spin_lock(&sbi->inode_hash_lock);
 	fat_cache_inval_inode(inode);
-	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
-	spin_unlock(&sbi->inode_hash_lock);
+	fat_detach(inode);
 }
 
 static void fat_write_super(struct super_block *sb)
-- 
cgit v1.2.3


From 068f5ae05c51d2cee6b31cb3da06775dd83bd348 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: vfat: Fix vfat_find() error path in vfat_lookup()

Current vfat_lookup() creates negetive dentry blindly if vfat_find()
returned a error. It's wrong. If the error isn't -ENOENT, just return
error.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 1536bc3ca0f..419deabfb9b 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -683,7 +683,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct dentry *alias;
 	int err, table;
 
@@ -693,14 +693,18 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
-		table++;
+		if (err == -ENOENT) {
+			table++;
+			inode = NULL;
+			goto out;
+		}
 		goto error;
 	}
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
+		err = PTR_ERR(inode);
+		goto error;
 	}
 	alias = d_find_alias(inode);
 	if (alias) {
@@ -713,7 +717,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		}
 
 	}
-error:
+out:
 	unlock_super(sb);
 	dentry->d_op = &vfat_dentry_ops[table];
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
@@ -723,6 +727,10 @@ error:
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
+
+error:
+	unlock_super(sb);
+	return ERR_PTR(err);
 }
 
 static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-- 
cgit v1.2.3


From 1b52467243c7167b3a267ddbcbb14d550f28eb4a Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: fat: Fix/Cleanup dcache handling for vfat

- Add comments for handling dcache of vfat.

- Separate case-sensitive case and case-insensitive to
  vfat_revalidate() and vfat_ci_revalidate().

  vfat_revalidate() doesn't need to drop case-insensitive negative
  dentry on creation path.

- Current code is missing to set ->d_revalidate to the negative dentry
  created by unlink/etc..

  This sets ->d_revalidate always, and returns 1 for positive
  dentry. Now, we don't need to change ->d_op dynamically anymore,
  so this just uses sb->s_root->d_op to set ->d_op.

- d_find_alias() may return DCACHE_DISCONNECTED dentry. It's not
  the interesting dentry there. This checks it.

- Add missing LOOKUP_PARENT check. We don't need to drop the valid
  negative dentry for (LOOKUP_CREATE | LOOKUP_PARENT) lookup.

- For consistent filename on creation path, this drops negative dentry
  if we can't see intent.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 124 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 419deabfb9b..d585398f9f6 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -24,27 +24,67 @@
 #include <linux/namei.h>
 #include "fat.h"
 
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+/*
+ * If new entry was created in the parent, it could create the 8.3
+ * alias (the shortname of logname).  So, the parent may have the
+ * negative-dentry which matches the created 8.3 alias.
+ *
+ * If it happened, the negative dentry isn't actually negative
+ * anymore.  So, drop it.
+ */
+static int vfat_revalidate_shortname(struct dentry *dentry)
 {
 	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_time != dentry->d_parent->d_inode->i_version)
 		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
+	spin_unlock(&dentry->d_lock);
 	return ret;
 }
 
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	/* This is not negative dentry. Always valid. */
+	if (dentry->d_inode)
+		return 1;
+	return vfat_revalidate_shortname(dentry);
+}
+
+static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+{
+	/*
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
+	 */
+	if (dentry->d_inode)
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!nd)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & LOOKUP_CREATE)
+			return 0;
+	}
+
+	return vfat_revalidate_shortname(dentry);
+}
+
 /* returns the length of a struct qstr, ignoring trailing dots */
 static unsigned int vfat_striptail_len(struct qstr *qstr)
 {
@@ -126,25 +166,16 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	return 1;
 }
 
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
+static struct dentry_operations vfat_ci_dentry_ops = {
+	.d_revalidate	= vfat_revalidate_ci,
+	.d_hash		= vfat_hashi,
+	.d_compare	= vfat_cmpi,
+};
+
+static struct dentry_operations vfat_dentry_ops = {
+	.d_revalidate	= vfat_revalidate,
+	.d_hash		= vfat_hash,
+	.d_compare	= vfat_cmp,
 };
 
 /* Characters that are undesirable in an MS-DOS file name */
@@ -685,29 +716,35 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	struct fat_slot_info sinfo;
 	struct inode *inode;
 	struct dentry *alias;
-	int err, table;
+	int err;
 
 	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
 		if (err == -ENOENT) {
-			table++;
 			inode = NULL;
 			goto out;
 		}
 		goto error;
 	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto error;
 	}
+
 	alias = d_find_alias(inode);
-	if (alias) {
+	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+		/*
+		 * This inode has non DCACHE_DISCONNECTED dentry. This
+		 * means, the user did ->lookup() by an another name
+		 * (longname vs 8.3 alias of it) in past.
+		 *
+		 * Switch to new one for reason of locality if possible.
+		 */
 		if (d_invalidate(alias) == 0)
 			dput(alias);
 		else {
@@ -715,15 +752,14 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 			unlock_super(sb);
 			return alias;
 		}
-
 	}
 out:
 	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
+	dentry->d_op = sb->s_root->d_op;
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_op = sb->s_root->d_op;
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
@@ -1022,9 +1058,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 		return res;
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
+		sb->s_root->d_op = &vfat_ci_dentry_ops;
 	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
+		sb->s_root->d_op = &vfat_dentry_ops;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1c13a243a461dd5b089d29e5d57f260c990e462c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:52 -0800
Subject: fat: Kill d_invalidate() in vfat_lookup()

d_invalidate() for positive dentry doesn't work in some cases
(vfsmount, nfsd, and maybe others). shrink_dcache_parent() by
d_invalidate() is pointless for vfat usage at all.

So, this kills it, and intead of it uses d_move().

To save old behavior, this returns alias simply for directory (don't
change pwd, etc..). the directory lookup shouldn't be important for
performance.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index d585398f9f6..bf326d4356a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -745,13 +745,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
+		BUG_ON(d_unhashed(alias));
+		if (!S_ISDIR(inode->i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		unlock_super(sb);
+		return alias;
 	}
 out:
 	unlock_super(sb);
-- 
cgit v1.2.3


From 45cfbe354785a5bc9a38354754d6f7322f598001 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:53 -0800
Subject: fat: Cleanup msdos_lookup()

Use same style with vfat_lookup().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_msdos.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e92e8158eba..7ba03a4acbe 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -203,33 +203,37 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
+	struct inode *inode;
+	int err;
 
 	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
+
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
+		err = PTR_ERR(inode);
+		goto error;
 	}
-add:
-	res = 0;
+out:
+	unlock_super(sb);
+	dentry->d_op = &msdos_dentry_operations;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry)
 		dentry->d_op = &msdos_dentry_operations;
-out:
+	return dentry;
+
+error:
 	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
+	return ERR_PTR(err);
 }
 
 /***** Creates a directory entry (name is already formatted). */
-- 
cgit v1.2.3


From 9c0aa1b87bf541affef519eb4879ce7c5a5941ae Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Cleanup FAT attribute stuff

This adds three helpers:

fat_make_attrs() - makes FAT attributes from inode.
fat_make_mode()  - makes mode_t from FAT attributes.
fat_save_attrs() - saves FAT attributes to inode.

Then this replaces: MSDOS_MKMODE() by fat_make_mode(), fat_attr() by
fat_make_attrs(), ->i_attrs = attr & ATTR_UNUSED by fat_save_attrs().
And for root inode, those is used with ATTR_DIR instead of bogus
ATTR_NONE.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   | 20 +++++++++++++++++++-
 fs/fat/file.c  | 32 ++++++++++++--------------------
 fs/fat/inode.c | 19 +++++++++----------
 3 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 2b8e94c3eef..3b4753a024e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,14 +117,32 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
+				   u8 attrs, mode_t mode)
+{
+	if (attrs & ATTR_RO)
+		mode &= ~S_IWUGO;
+
+	if (attrs & ATTR_DIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
 /* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
+static inline u8 fat_make_attrs(struct inode *inode)
 {
 	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
 		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
 		MSDOS_I(inode)->i_attrs;
 }
 
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{
+	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+}
+
 static inline unsigned char fat_checksum(const __u8 *name)
 {
 	unsigned char s = name[0];
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b21973f266a..f5a7e907a8f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,13 +27,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr;
-
-		if (inode->i_ino == MSDOS_ROOT_INO)
-			attr = ATTR_DIR;
-		else
-			attr = fat_attr(inode);
-
+		u32 attr = fat_make_attrs(inode);
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
@@ -62,20 +56,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		/* Merge in ATTR_VOLUME and ATTR_DIR */
 		attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
 			(is_dir ? ATTR_DIR : 0);
-		oldattr = fat_attr(inode);
+		oldattr = fat_make_attrs(inode);
 
 		/* Equivalent to a chmod() */
 		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
 		ia.ia_ctime = current_fs_time(inode->i_sb);
-		if (is_dir) {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				S_IRWXUGO & ~sbi->options.fs_dmask)
-				| S_IFDIR;
-		} else {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				(S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO))
-				& ~sbi->options.fs_fmask)
-				| S_IFREG;
+		if (is_dir)
+			ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+		else {
+			ia.ia_mode = fat_make_mode(sbi, attr,
+				S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
 		}
 
 		/* The root directory has no attributes */
@@ -115,7 +105,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 				inode->i_flags &= S_IMMUTABLE;
 		}
 
-		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
+		fat_save_attrs(inode, attr);
 		mark_inode_dirty(inode);
 up:
 		mnt_drop_write(filp->f_path.mnt);
@@ -274,7 +264,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 
 	/*
 	 * Note, the basic check is already done by a caller of
-	 * (attr->ia_mode & ~MSDOS_VALID_MODE)
+	 * (attr->ia_mode & ~FAT_VALID_MODE)
 	 */
 
 	if (S_ISREG(inode->i_mode))
@@ -314,6 +304,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 }
 
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -356,7 +348,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	    ((attr->ia_valid & ATTR_GID) &&
 	     (attr->ia_gid != sbi->options.fs_gid)) ||
 	    ((attr->ia_valid & ATTR_MODE) &&
-	     (attr->ia_mode & ~MSDOS_VALID_MODE)))
+	     (attr->ia_mode & ~FAT_VALID_MODE)))
 		error = -EPERM;
 
 	if (error) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8e1b75c63c7..7aaa21cf019 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -337,8 +337,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 
 	if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
 		inode->i_generation &= ~1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-			S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+		inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
 		inode->i_op = sbi->dir_ops;
 		inode->i_fop = &fat_dir_operations;
 
@@ -355,10 +354,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		inode->i_nlink = fat_subdirs(inode);
 	} else { /* not a directory */
 		inode->i_generation |= 1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-		    ((sbi->options.showexec && !is_exec(de->name + 8))
-			? S_IRUGO|S_IWUGO : S_IRWXUGO)
-		    & ~sbi->options.fs_fmask) | S_IFREG;
+		inode->i_mode = fat_make_mode(sbi, de->attr,
+			((sbi->options.showexec && !is_exec(de->name + 8))
+			 ? S_IRUGO|S_IWUGO : S_IRWXUGO));
 		MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
 		if (sbi->fat_bits == 32)
 			MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16);
@@ -374,7 +372,8 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		if (sbi->options.sys_immutable)
 			inode->i_flags |= S_IMMUTABLE;
 	}
-	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
+	fat_save_attrs(inode, de->attr);
+
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
@@ -569,7 +568,7 @@ retry:
 		raw_entry->size = 0;
 	else
 		raw_entry->size = cpu_to_le32(inode->i_size);
-	raw_entry->attr = fat_attr(inode);
+	raw_entry->attr = fat_make_attrs(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
 	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
@@ -1105,7 +1104,7 @@ static int fat_read_root(struct inode *inode)
 	inode->i_gid = sbi->options.fs_gid;
 	inode->i_version++;
 	inode->i_generation = 0;
-	inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
 	inode->i_op = sbi->dir_ops;
 	inode->i_fop = &fat_dir_operations;
 	if (sbi->fat_bits == 32) {
@@ -1122,7 +1121,7 @@ static int fat_read_root(struct inode *inode)
 	MSDOS_I(inode)->i_logstart = 0;
 	MSDOS_I(inode)->mmu_private = inode->i_size;
 
-	MSDOS_I(inode)->i_attrs = ATTR_NONE;
+	fat_save_attrs(inode, ATTR_DIR);
 	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
 	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
 	inode->i_nlink = fat_subdirs(inode)+2;
-- 
cgit v1.2.3


From 9183482f5d4a2de00f66641b974e7f351d41b675 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Fix ATTR_RO in the case of (~umask & S_WUGO) == 0

If inode->i_mode doesn't have S_WUGO, current code assumes it means
ATTR_RO.  However, if (~[ufd]mask & S_WUGO) == 0, inode->i_mode can't
hold S_WUGO. Therefore the updated directory entry will always have
ATTR_RO.

This adds fat_mode_can_hold_ro() to check it. And if inode->i_mode
can't hold, uses -i_attrs to hold ATTR_RO instead.

With this, we don't set ATTR_RO unless users change it via ioctl() if
(~[ufd]mask & S_WUGO) == 0.

And on FAT_IOCTL_GET_ATTRIBUTES path, this adds ->i_mutex to it for
not returning the partially updated attributes by FAT_IOCTL_SET_ATTRIBUTES
to userland.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h  | 33 +++++++++++++++++++++++++++++----
 fs/fat/file.c |  7 ++++++-
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 3b4753a024e..313b645b812 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,6 +117,25 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ */
+static inline int fat_mode_can_hold_ro(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	mode_t mask;
+
+	if (S_ISDIR(inode->i_mode))
+		mask = ~sbi->options.fs_dmask;
+	else
+		mask = ~sbi->options.fs_fmask;
+
+	if (!(mask & S_IWUGO))
+		return 0;
+	return 1;
+}
+
 /* Convert attribute bits and a mask to the UNIX mode. */
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
@@ -133,14 +152,20 @@ static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 /* Return the FAT attribute byte for this inode */
 static inline u8 fat_make_attrs(struct inode *inode)
 {
-	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
-		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
-		MSDOS_I(inode)->i_attrs;
+	u8 attrs = MSDOS_I(inode)->i_attrs;
+	if (S_ISDIR(inode->i_mode))
+		attrs |= ATTR_DIR;
+	if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		attrs |= ATTR_RO;
+	return attrs;
 }
 
 static inline void fat_save_attrs(struct inode *inode, u8 attrs)
 {
-	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	if (fat_mode_can_hold_ro(inode))
+		MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	else
+		MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO);
 }
 
 static inline unsigned char fat_checksum(const __u8 *name)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f5a7e907a8f..81b15c62380 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,7 +27,12 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr = fat_make_attrs(inode);
+		u32 attr;
+
+		mutex_lock(&inode->i_mutex);
+		attr = fat_make_attrs(inode);
+		mutex_unlock(&inode->i_mutex);
+
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
-- 
cgit v1.2.3


From dfc209c0064efef5590f608056a48b61a5cac09c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:55 -0800
Subject: fat: Fix ATTR_RO for directory

FAT has the ATTR_RO (read-only) attribute. But on Windows, the ATTR_RO
of the directory will be just ignored actually, and is used by only
applications as flag. E.g. it's setted for the customized folder by
Explorer.

http://msdn2.microsoft.com/en-us/library/aa969337.aspx

This adds "rodir" option. If user specified it, ATTR_RO is used as
read-only flag even if it's the directory. Otherwise, inode->i_mode
is not used to hold ATTR_RO (i.e. fat_mode_can_save_ro() returns 0).

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   | 14 ++++++++++----
 fs/fat/file.c  | 16 ++++++++++++----
 fs/fat/inode.c | 17 +++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 313b645b812..e9dce5d8e7a 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -38,7 +38,8 @@ struct fat_mount_options {
 		 flush:1,	  /* write things quickly */
 		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
 		 usefree:1,	  /* Use free_clusters for FAT32 */
-		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+		 tz_utc:1,	  /* Filesystem timestamps are in UTC */
+		 rodir:1;	  /* allow ATTR_RO for directory */
 };
 
 #define FAT_HASH_BITS	8
@@ -120,15 +121,20 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 /*
  * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
  * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
  */
 static inline int fat_mode_can_hold_ro(struct inode *inode)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	mode_t mask;
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		if (!sbi->options.rodir)
+			return 0;
 		mask = ~sbi->options.fs_dmask;
-	else
+	} else
 		mask = ~sbi->options.fs_fmask;
 
 	if (!(mask & S_IWUGO))
@@ -140,7 +146,7 @@ static inline int fat_mode_can_hold_ro(struct inode *inode)
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
 {
-	if (attrs & ATTR_RO)
+	if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
 		mode &= ~S_IWUGO;
 
 	if (attrs & ATTR_DIR)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 81b15c62380..f06a4e525ec 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -282,11 +282,18 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 	/*
 	 * Of the r and x bits, all (subject to umask) must be present. Of the
 	 * w bits, either all (subject to umask) or none must be present.
+	 *
+	 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
 	 */
 	if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
 		return -EPERM;
-	if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
-		return -EPERM;
+	if (fat_mode_can_hold_ro(inode)) {
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
 
 	*mode_ptr &= S_IFMT | perm;
 
@@ -316,8 +323,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
-	int error = 0;
 	unsigned int ia_valid;
+	int error;
 
 	/*
 	 * Expand the file. Since inode_setattr() updates ->i_size
@@ -371,7 +378,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~ATTR_MODE;
 	}
 
-	error = inode_setattr(inode, attr);
+	if (attr->ia_valid)
+		error = inode_setattr(inode, attr);
 out:
 	return error;
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7aaa21cf019..0da04e6d1e3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -797,8 +797,10 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 			seq_puts(m, ",uni_xlate");
 		if (!opts->numtail)
 			seq_puts(m, ",nonumtail");
+		if (opts->rodir)
+			seq_puts(m, ",rodir");
 	}
-	if (sbi->options.flush)
+	if (opts->flush)
 		seq_puts(m, ",flush");
 	if (opts->tz_utc)
 		seq_puts(m, ",tz=UTC");
@@ -814,7 +816,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -886,6 +888,7 @@ static const match_table_t vfat_tokens = {
 	{Opt_nonumtail_yes, "nonumtail=yes"},
 	{Opt_nonumtail_yes, "nonumtail=true"},
 	{Opt_nonumtail_yes, "nonumtail"},
+	{Opt_rodir, "rodir"},
 	{Opt_err, NULL}
 };
 
@@ -905,10 +908,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
-	if (is_vfat)
+	if (is_vfat) {
 		opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
-	else
+		opts->rodir = 0;
+	} else {
 		opts->shortname = 0;
+		opts->rodir = 1;
+	}
 	opts->name_check = 'n';
 	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
 	opts->utf8 = opts->unicode_xlate = 0;
@@ -1059,6 +1065,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_nonumtail_yes:		/* empty or 1 or yes or true */
 			opts->numtail = 0;	/* negated option */
 			break;
+		case Opt_rodir:
+			opts->rodir = 1;
+			break;
 
 		/* obsolete mount options */
 		case Opt_obsolate:
-- 
cgit v1.2.3


From fa93ca18a8b0da4e26bd9491ad144cd14d22f8ec Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Fix _fat_bmap() race

fat_get_cluster() assumes the requested blocknr isn't truncated during
read. _fat_bmap() doesn't follow this rule.

This protects it by ->i_mutex.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0da04e6d1e3..be88208b83a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -199,7 +199,14 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
-	return generic_block_bmap(mapping, block, fat_get_block);
+	sector_t blocknr;
+
+	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
+	mutex_lock(&mapping->host->i_mutex);
+	blocknr = generic_block_bmap(mapping, block, fat_get_block);
+	mutex_unlock(&mapping->host->i_mutex);
+
+	return blocknr;
 }
 
 static const struct address_space_operations fat_aops = {
-- 
cgit v1.2.3


From 0e75f5da06c05425f4b375eb981c4489fb2d9787 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Add printf attribute to fat_fs_panic()

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e9dce5d8e7a..a69f7f9757c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -308,7 +308,8 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
-- 
cgit v1.2.3


From 2bdf67eb1631f30e2f3f5d49e4007c76e88877a8 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: mmu_private race fix

mmu_private is 64bits value, hence it's not atomic to update.

So, the access rule for mmu_private is we must hold ->i_mutex.  But,
fat_get_block() path doesn't follow the rule on non-allocation path.

This fixes by using i_size instead if non-allocation path.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c | 23 ++++++++++++++++++-----
 fs/fat/dir.c   |  2 +-
 fs/fat/fat.h   |  6 ++++--
 fs/fat/inode.c |  4 ++--
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 589edde9053..b4260229808 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -293,10 +293,12 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 }
 
 int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-	     unsigned long *mapped_blocks)
+	     unsigned long *mapped_blocks, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
 	sector_t last_block;
 	int cluster, offset;
 
@@ -309,10 +311,21 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
 		}
 		return 0;
 	}
-	last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
-		>> sb->s_blocksize_bits;
-	if (sector >= last_block)
-		return 0;
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (!create)
+			return 0;
+
+		/*
+		 * ->mmu_private can access on only allocation path.
+		 * (caller must hold ->i_mutex)
+		 */
+		last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+			>> blocksize_bits;
+		if (sector >= last_block)
+			return 0;
+	}
 
 	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
 	offset  = sector & (sbi->sec_per_clus - 1);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 140fc39e230..2ecaa17acdb 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -77,7 +77,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a69f7f9757c..4efc5038ed2 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -91,7 +91,9 @@ struct msdos_inode_info {
 	/* for avoiding the race between fat_free() and fat_get_cluster() */
 	unsigned int cache_valid_id;
 
-	loff_t mmu_private;
+	/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
+	loff_t mmu_private;	/* physically allocated size */
+
 	int i_start;		/* first cluster or 0 */
 	int i_logstart;		/* logical first cluster */
 	int i_attrs;		/* unused attribute bits */
@@ -222,7 +224,7 @@ extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
 extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks);
+		    unsigned long *mapped_blocks, int create);
 
 /* fat/dir.c */
 extern const struct file_operations fat_dir_operations;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index be88208b83a..9e37ad93c73 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -64,7 +64,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	sector_t phys;
 	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 	if (phys) {
@@ -94,7 +94,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	*max_blocks = min(mapped_blocks, *max_blocks);
 	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 9ca59f4c3d28df14a1545a1e2832f34a0a50e3ed Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: ->i_pos race fix

i_pos is 64bits value, hence it's not atomic to update.

Important place is fat_write_inode() only, other places without lock
are just for printk().

This adds lock for "BITS_PER_LONG == 32" kernel.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9e37ad93c73..bdd8fb7be2c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -542,6 +542,20 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+				    struct inode *inode)
+{
+	loff_t i_pos;
+#if BITS_PER_LONG == 32
+	spin_lock(&sbi->inode_hash_lock);
+#endif
+	i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+	spin_unlock(&sbi->inode_hash_lock);
+#endif
+	return i_pos;
+}
+
 static int fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
@@ -551,9 +565,12 @@ static int fat_write_inode(struct inode *inode, int wait)
 	loff_t i_pos;
 	int err;
 
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
 retry:
-	i_pos = MSDOS_I(inode)->i_pos;
-	if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
+	i_pos = fat_i_pos_read(sbi, inode);
+	if (!i_pos)
 		return 0;
 
 	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
-- 
cgit v1.2.3


From c3302931db090d87e9015c3a7ce5c97a7dd90f78 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:58 -0800
Subject: fat: i_blocks warning fix

blkcnt_t type depends on CONFIG_LSF. Use unsigned long long always for
printk().  But lazy to type it, so add "llu" and use it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c    | 2 +-
 fs/fat/fat.h    | 3 +++
 fs/fat/fatent.c | 5 ++---
 fs/fat/misc.c   | 5 +++--
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 2ecaa17acdb..67e05835709 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -86,7 +86,7 @@ next:
 	*bh = sb_bread(sb, phys);
 	if (*bh == NULL) {
 		printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
-		       (unsigned long long)phys);
+		       (llu)phys);
 		/* skip this block */
 		*pos = (iblock + 1) << sb->s_blocksize_bits;
 		goto next;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4efc5038ed2..ea440d65819 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,4 +323,7 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 int fat_cache_init(void);
 void fat_cache_destroy(void);
 
+/* helper for printk */
+typedef unsigned long long	llu;
+
 #endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 13513992da3..da6eea47872 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -93,8 +93,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
 	brelse(bhs[0]);
 err:
-	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-	       (unsigned long long)blocknr);
+	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
 	return -EIO;
 }
 
@@ -107,7 +106,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
 		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-		       (unsigned long long)blocknr);
+		       (llu)blocknr);
 		return -EIO;
 	}
 	fatent->nr_bhs = 1;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a191e79e66a..ac39ebcc149 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -124,8 +124,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 			mark_inode_dirty(inode);
 	}
 	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-		fat_fs_panic(sb, "clusters badly computed (%d != %lu)",
-			new_fclus, inode->i_blocks >> (sbi->cluster_bits - 9));
+		fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+			     new_fclus,
+			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
 		fat_cache_inval_inode(inode);
 	}
 	inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
-- 
cgit v1.2.3


From ed9b3e3379731e9f9d2f73f3d7fd9e7d2ce3df4a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 7 Nov 2008 09:06:45 -0500
Subject: ext4: Mark the buffer_heads as dirty and uptodate after prepare_write

We need to make sure we mark the buffer_heads as dirty and uptodate
so that block_write_full_page write them correctly.

This fixes mmap corruptions that can occur in low memory situations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a130b56f1c..be21a5ae33c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
+		/* now mark the buffer_heads as dirty and uptodate */
+		block_commit_write(page, 0, PAGE_CACHE_SIZE);
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-- 
cgit v1.2.3


From 23712a9c28b9f80a8cf70c8490358d5f562d2465 Mon Sep 17 00:00:00 2001
From: Frederic Bohe <frederic.bohe@bull.net>
Date: Fri, 7 Nov 2008 09:21:01 -0500
Subject: ext4: add checksum calculation when clearing UNINIT flag in
 ext4_new_inode

When initializing an uninitialized block group in ext4_new_inode(),
its block group checksum must be re-calculated.  This fixes a race
when several threads try to allocate a new inode in an UNINIT'd group.

There is some question whether we need to be initializing the block
bitmap in ext4_new_inode() at all, but for now, if we are going to
init the block group, let's eliminate the race.

Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fe34d74cfb1..2a117e286e5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -718,6 +718,8 @@ got:
 			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
 			gdp->bg_free_blocks_count = cpu_to_le16(free);
+			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+								gdp);
 		}
 		spin_unlock(sb_bgl_lock(sbi, group));
 
-- 
cgit v1.2.3


From b726e923ea4d216027e466aa602d914e4b4a63af Mon Sep 17 00:00:00 2001
From: Doug Nazar <nazard@dragoninc.ca>
Date: Wed, 5 Nov 2008 06:16:28 -0500
Subject: Fix nfsd truncation of readdir results

Commit 8d7c4203 "nfsd: fix failure to set eof in readdir in some
situations" introduced a bug: on a directory in an exported ext3
filesystem with dir_index unset, a READDIR will only return about 250
entries, even if the directory was larger.

Bisected it back to this commit; reverting it fixes the problem.

It turns out that in this case ext3 reads a block at a time, then
returns from readdir, which means we can end up with buf.full==0 but
with more entries in the directory still to be read.  Before 8d7c4203
(but after c002a6c797 "Optimise NFS readdir hack slightly"), this would
cause us to return the READDIR result immediately, but with the eof bit
unset.  That could cause a performance regression (because the client
would need more roundtrips to the server to read the whole directory),
but no loss in correctness, since the cleared eof bit caused the client
to send another readdir.  After 8d7c4203, the setting of the eof bit
made this a correctness problem.

So, move nfserr_eof into the loop and remove the buf.full check so that
we loop until buf.used==0.  The following seems to do the right thing
and reduces the network traffic since we don't return a READDIR result
until the buffer is full.

Tested on an empty directory & large directory; eof is properly sent and
there are no more short buffers.

Signed-off-by: Doug Nazar <nazard@dragoninc.ca>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 848a03e83a4..4433c8f0016 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		return -ENOMEM;
 
 	offset = *offsetp;
-	cdp->err = nfserr_eof; /* will be cleared on successful read */
 
 	while (1) {
 		unsigned int reclen;
 
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
 		buf.used = 0;
 		buf.full = 0;
 
@@ -1912,9 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
-		cdp->err = nfserr_eof;
-		if (!buf.full)
-			break;
 	}
 
  done:
-- 
cgit v1.2.3