From c1e817d03a7de57a963654c35e6e80af9a5dbff5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 22 Jul 2008 22:58:03 +0100
Subject: GFS2: Fix debugfs glock file iterator

Due to an incorrect iterator, some glocks were being missed from the
glock dumps obtained via debugfs. This patch fixes the problem and
ensures that we don't miss any glocks in future.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e54661..4cbb6957a0d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1816,15 +1816,17 @@ restart:
 	if (gl) {
 		gi->gl = hlist_entry(gl->gl_list.next,
 				     struct gfs2_glock, gl_list);
-		if (gi->gl)
-			gfs2_glock_hold(gi->gl);
+	} else {
+		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+				     struct gfs2_glock, gl_list);
 	}
+	if (gi->gl)
+		gfs2_glock_hold(gi->gl);
 	read_unlock(gl_lock_addr(gi->hash));
 	if (gl)
 		gfs2_glock_put(gl);
-	if (gl && gi->gl == NULL)
-		gi->hash++;
 	while (gi->gl == NULL) {
+		gi->hash++;
 		if (gi->hash >= GFS2_GL_HASH_SIZE)
 			return 1;
 		read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
 		if (gi->gl)
 			gfs2_glock_hold(gi->gl);
 		read_unlock(gl_lock_addr(gi->hash));
-		gi->hash++;
 	}
 
 	if (gi->sdp != gi->gl->gl_sbd)
-- 
cgit v1.2.3


From 9b8df98fc8973ad1c5f0d7c4cf71c7fb84fe22c5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 8 Aug 2008 13:45:13 +0100
Subject: GFS2: Fix metafs mounts

This patch is intended to fix the issues reported in bz #457798. Instead
of having the metafs as a separate filesystem, it becomes a second root
of gfs2. As a result it will appear as type gfs2 in /proc/mounts, but it
is still possible (for backwards compatibility purposes) to mount it as
type gfs2meta. A new mount flag "meta" is introduced so that its possible
to tell the two cases apart in /proc/mounts.

As a result it becomes possible to mount type gfs2 with -o meta and
get the same result as mounting type gfs2meta. So it is possible to
mount just the metafs on its own. Currently if you do this, its then
impossible to mount the "normal" root of the gfs2 filesystem without
first unmounting the metafs root. I'm not sure if thats a feature or
a bug :-)

Either way, this is a great improvement on the previous scheme and I've
verified that it works ok with bind mounts on both the "normal" root
and the metafs root in various combinations.

There were also a bunch of functions in super.c which didn't belong there,
so this moves them into ops_fstype.c where they can be static. Hopefully
the mount/umount sequence is now more obvious as a result.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Alexander Viro <aviro@redhat.com>
---
 fs/gfs2/incore.h     |   7 +-
 fs/gfs2/mount.c      |   7 +
 fs/gfs2/ops_fstype.c | 566 +++++++++++++++++++++++++++++++++++----------------
 fs/gfs2/ops_super.c  |  57 +++++-
 fs/gfs2/super.c      | 340 -------------------------------
 fs/gfs2/super.h      |   6 -
 6 files changed, 448 insertions(+), 535 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c46..a1777a1927b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -400,6 +400,7 @@ struct gfs2_args {
 	int ar_quota; /* off/account/on */
 	int ar_suiddir; /* suiddir support */
 	int ar_data; /* ordered/writeback */
+	int ar_meta; /* mount metafs */
 };
 
 struct gfs2_tune {
@@ -461,7 +462,6 @@ struct gfs2_sb_host {
 
 struct gfs2_sbd {
 	struct super_block *sd_vfs;
-	struct super_block *sd_vfs_meta;
 	struct kobject sd_kobj;
 	unsigned long sd_flags;	/* SDF_... */
 	struct gfs2_sb_host sd_sb;
@@ -499,7 +499,9 @@ struct gfs2_sbd {
 
 	/* Inode Stuff */
 
-	struct inode *sd_master_dir;
+	struct dentry *sd_master_dir;
+	struct dentry *sd_root_dir;
+
 	struct inode *sd_jindex;
 	struct inode *sd_inum_inode;
 	struct inode *sd_statfs_inode;
@@ -634,7 +636,6 @@ struct gfs2_sbd {
 	/* Debugging crud */
 
 	unsigned long sd_last_warning;
-	struct vfsmount *sd_gfs2mnt;
 	struct dentry *debugfs_dir;    /* debugfs directory */
 	struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f95..df48333e6f0 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
 	Opt_nosuiddir,
 	Opt_data_writeback,
 	Opt_data_ordered,
+	Opt_meta,
 	Opt_err,
 };
 
@@ -66,6 +67,7 @@ static match_table_t tokens = {
 	{Opt_nosuiddir, "nosuiddir"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_ordered, "data=ordered"},
+	{Opt_meta, "meta"},
 	{Opt_err, NULL}
 };
 
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 		case Opt_data_ordered:
 			args->ar_data = GFS2_DATA_ORDERED;
 			break;
+		case Opt_meta:
+			if (remount && args->ar_meta != 1)
+				goto cant_remount;
+			args->ar_meta = 1;
+			break;
 		case Opt_err:
 		default:
 			fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d649063..a6225cce2cb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,45 @@
 #define DO 0
 #define UNDO 1
 
+static const u32 gfs2_old_fs_formats[] = {
+        0
+};
+
+static const u32 gfs2_old_multihost_formats[] = {
+        0
+};
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+
+static void gfs2_tune_init(struct gfs2_tune *gt)
+{
+	spin_lock_init(&gt->gt_spin);
+
+	gt->gt_demote_secs = 300;
+	gt->gt_incore_log_blocks = 1024;
+	gt->gt_log_flush_secs = 60;
+	gt->gt_recoverd_secs = 60;
+	gt->gt_logd_secs = 1;
+	gt->gt_quotad_secs = 5;
+	gt->gt_quota_simul_sync = 64;
+	gt->gt_quota_warn_period = 10;
+	gt->gt_quota_scale_num = 1;
+	gt->gt_quota_scale_den = 1;
+	gt->gt_quota_cache_secs = 300;
+	gt->gt_quota_quantum = 60;
+	gt->gt_atime_quantum = 3600;
+	gt->gt_new_files_jdata = 0;
+	gt->gt_max_readahead = 1 << 18;
+	gt->gt_stall_secs = 600;
+	gt->gt_complain_secs = 10;
+	gt->gt_statfs_quantum = 30;
+	gt->gt_statfs_slow = 0;
+}
+
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp;
@@ -113,6 +152,272 @@ static void init_vfs(struct super_block *sb, unsigned noatime)
 	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
 }
 
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
+{
+	unsigned int x;
+
+	if (sb->sb_magic != GFS2_MAGIC ||
+	    sb->sb_type != GFS2_METATYPE_SB) {
+		if (!silent)
+			printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+		return -EINVAL;
+	}
+
+	/*  If format numbers match exactly, we're done.  */
+
+	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+		return 0;
+
+	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+		for (x = 0; gfs2_old_fs_formats[x]; x++)
+			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+				break;
+
+		if (!gfs2_old_fs_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+		for (x = 0; gfs2_old_multihost_formats[x]; x++)
+			if (gfs2_old_multihost_formats[x] ==
+			    sb->sb_multihost_format)
+				break;
+
+		if (!gfs2_old_multihost_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!sdp->sd_args.ar_upgrade) {
+		printk(KERN_WARNING
+		       "GFS2: code version (%u, %u) is incompatible "
+		       "with ondisk format (%u, %u)\n",
+		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+		       sb->sb_fs_format, sb->sb_multihost_format);
+		printk(KERN_INFO
+		       "GFS2: Use the \"upgrade\" mount option to upgrade "
+		       "the FS\n");
+		printk(KERN_INFO "GFS2: See the manual for more details\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void end_bio_io_page(struct bio *bio, int error)
+{
+	struct page *page = bio->bi_private;
+
+	if (!error)
+		SetPageUptodate(page);
+	else
+		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+	unlock_page(page);
+}
+
+static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
+{
+	const struct gfs2_sb *str = buf;
+
+	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
+	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+	sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
+	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+
+	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block
+ * @error: The error code to return
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read twice only during each GFS2 mount and is
+ * never written to by the filesystem. The first time its read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * Returns: 0 on success or error
+ */
+
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_sb *p;
+	struct page *page;
+	struct bio *bio;
+
+	page = alloc_page(GFP_NOFS);
+	if (unlikely(!page))
+		return -ENOBUFS;
+
+	ClearPageUptodate(page);
+	ClearPageDirty(page);
+	lock_page(page);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (unlikely(!bio)) {
+		__free_page(page);
+		return -ENOBUFS;
+	}
+
+	bio->bi_sector = sector * (sb->s_blocksize >> 9);
+	bio->bi_bdev = sb->s_bdev;
+	bio_add_page(bio, page, PAGE_SIZE, 0);
+
+	bio->bi_end_io = end_bio_io_page;
+	bio->bi_private = page;
+	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+	wait_on_page_locked(page);
+	bio_put(bio);
+	if (!PageUptodate(page)) {
+		__free_page(page);
+		return -EIO;
+	}
+	p = kmap(page);
+	gfs2_sb_in(&sdp->sd_sb, p);
+	kunmap(page);
+	__free_page(page);
+	return 0;
+}
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held)
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+	u32 hash_blocks, ind_blocks, leaf_blocks;
+	u32 tmp_blocks;
+	unsigned int x;
+	int error;
+
+	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+	if (error) {
+		if (!silent)
+			fs_err(sdp, "can't read superblock\n");
+		return error;
+	}
+
+	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+	if (error)
+		return error;
+
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+			       GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_dinode)) / sizeof(u64);
+	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header)) /
+			        sizeof(struct gfs2_quota_change);
+
+	/* Compute maximum reservation required to add a entry to a directory */
+
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+			     sdp->sd_jbsize);
+
+	ind_blocks = 0;
+	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+		ind_blocks += tmp_blocks;
+	}
+
+	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_dinode);
+	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_heightsize[x - 1] || m)
+			break;
+		sdp->sd_heightsize[x] = space;
+	}
+	sdp->sd_max_height = x;
+	sdp->sd_heightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+				 sizeof(struct gfs2_dinode);
+	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_jheightsize[x - 1] || m)
+			break;
+		sdp->sd_jheightsize[x] = space;
+	}
+	sdp->sd_max_jheight = x;
+	sdp->sd_jheightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+	return 0;
+}
+
 static int init_names(struct gfs2_sbd *sdp, int silent)
 {
 	char *proto, *table;
@@ -224,51 +529,59 @@ fail:
 	return error;
 }
 
-static inline struct inode *gfs2_lookup_root(struct super_block *sb,
-					     u64 no_addr)
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
+			    u64 no_addr, const char *name)
 {
-	return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	if (IS_ERR(inode)) {
+		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
+		return PTR_ERR(inode);
+	}
+	dentry = d_alloc_root(inode);
+	if (!dentry) {
+		fs_err(sdp, "can't alloc %s dentry\n", name);
+		iput(inode);
+		return -ENOMEM;
+	}
+	dentry->d_op = &gfs2_dops;
+	*dptr = dentry;
+	return 0;
 }
 
-static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+static int init_sb(struct gfs2_sbd *sdp, int silent)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct gfs2_holder sb_gh;
 	u64 no_addr;
-	struct inode *inode;
-	int error = 0;
+	int ret;
 
-	if (undo) {
-		if (sb->s_root) {
-			dput(sb->s_root);
-			sb->s_root = NULL;
-		}
-		return 0;
+	ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+				LM_ST_SHARED, 0, &sb_gh);
+	if (ret) {
+		fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
+		return ret;
 	}
 
-	error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
-				 LM_ST_SHARED, 0, &sb_gh);
-	if (error) {
-		fs_err(sdp, "can't acquire superblock glock: %d\n", error);
-		return error;
-	}
-
-	error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
-	if (error) {
-		fs_err(sdp, "can't read superblock: %d\n", error);
+	ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+	if (ret) {
+		fs_err(sdp, "can't read superblock: %d\n", ret);
 		goto out;
 	}
 
 	/* Set up the buffer cache and SB for real */
 	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
-		error = -EINVAL;
+		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too small for device "
 		       "block size (%u)\n",
 		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
 		goto out;
 	}
 	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
-		error = -EINVAL;
+		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too big for machine "
 		       "page size (%u)\n",
 		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +591,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
 
 	/* Get the root inode */
 	no_addr = sdp->sd_sb.sb_root_dir.no_addr;
-	if (sb->s_type == &gfs2meta_fs_type)
-		no_addr = sdp->sd_sb.sb_master_dir.no_addr;
-	inode = gfs2_lookup_root(sb, no_addr);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		fs_err(sdp, "can't read in root inode: %d\n", error);
+	ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
+	if (ret)
 		goto out;
-	}
 
-	sb->s_root = d_alloc_root(inode);
-	if (!sb->s_root) {
-		fs_err(sdp, "can't get root dentry\n");
-		error = -ENOMEM;
-		iput(inode);
-	} else
-		sb->s_root->d_op = &gfs2_dops;
-	
+	/* Get the master inode */
+	no_addr = sdp->sd_sb.sb_master_dir.no_addr;
+	ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
+	if (ret) {
+		dput(sdp->sd_root_dir);
+		goto out;
+	}
+	sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
 out:
 	gfs2_glock_dq_uninit(&sb_gh);
-	return error;
+	return ret;
 }
 
 /**
@@ -372,6 +680,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
+	struct inode *master = sdp->sd_master_dir->d_inode;
 	struct gfs2_holder ji_gh;
 	struct task_struct *p;
 	struct gfs2_inode *ip;
@@ -383,7 +692,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		goto fail_recoverd;
 	}
 
-	sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
 	if (IS_ERR(sdp->sd_jindex)) {
 		fs_err(sdp, "can't lookup journal index: %d\n", error);
 		return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +815,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 {
 	int error = 0;
 	struct gfs2_inode *ip;
-	struct inode *inode;
+	struct inode *master = sdp->sd_master_dir->d_inode;
 
 	if (undo)
 		goto fail_qinode;
 
-	inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		fs_err(sdp, "can't read in master directory: %d\n", error);
-		goto fail;
-	}
-	sdp->sd_master_dir = inode;
-
 	error = init_journal(sdp, undo);
 	if (error)
-		goto fail_master;
+		goto fail;
 
 	/* Read in the master inode number inode */
-	sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+	sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
 	if (IS_ERR(sdp->sd_inum_inode)) {
 		error = PTR_ERR(sdp->sd_inum_inode);
 		fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +834,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 
 
 	/* Read in the master statfs inode */
-	sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+	sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
 	if (IS_ERR(sdp->sd_statfs_inode)) {
 		error = PTR_ERR(sdp->sd_statfs_inode);
 		fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +842,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	}
 
 	/* Read in the resource index inode */
-	sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+	sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
 	if (IS_ERR(sdp->sd_rindex)) {
 		error = PTR_ERR(sdp->sd_rindex);
 		fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +853,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
-	sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+	sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
 	if (IS_ERR(sdp->sd_quota_inode)) {
 		error = PTR_ERR(sdp->sd_quota_inode);
 		fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +872,6 @@ fail_inum:
 	iput(sdp->sd_inum_inode);
 fail_journal:
 	init_journal(sdp, UNDO);
-fail_master:
-	iput(sdp->sd_master_dir);
 fail:
 	return error;
 }
@@ -583,6 +882,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	char buf[30];
 	int error = 0;
 	struct gfs2_inode *ip;
+	struct inode *master = sdp->sd_master_dir->d_inode;
 
 	if (sdp->sd_args.ar_spectator)
 		return 0;
@@ -590,7 +890,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_qc_gh;
 
-	pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+	pn = gfs2_lookup_simple(master, "per_node");
 	if (IS_ERR(pn)) {
 		error = PTR_ERR(pn);
 		fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -828,7 +1128,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	if (error)
 		goto fail_lm;
 
-	error = init_sb(sdp, silent, DO);
+	error = init_sb(sdp, silent);
 	if (error)
 		goto fail_locking;
 
@@ -869,7 +1169,11 @@ fail_per_node:
 fail_inodes:
 	init_inodes(sdp, UNDO);
 fail_sb:
-	init_sb(sdp, 0, UNDO);
+	if (sdp->sd_root_dir)
+		dput(sdp->sd_root_dir);
+	if (sdp->sd_master_dir)
+		dput(sdp->sd_master_dir);
+	sb->s_root = NULL;
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
@@ -887,151 +1191,60 @@ fail:
 }
 
 static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+		       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *sb;
-	struct gfs2_sbd *sdp;
-	int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
-	if (error)
-		goto out;
-	sb = mnt->mnt_sb;
-	sdp = sb->s_fs_info;
-	sdp->sd_gfs2mnt = mnt;
-out:
-	return error;
+	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
 }
 
-static int fill_super_meta(struct super_block *sb, struct super_block *new,
-			   void *data, int silent)
+static struct super_block *get_gfs2_sb(const char *dev_name)
 {
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct inode *inode;
-	int error = 0;
-
-	new->s_fs_info = sdp;
-	sdp->sd_vfs_meta = sb;
-
-	init_vfs(new, SDF_NOATIME);
-
-        /* Get the master inode */
-	inode = igrab(sdp->sd_master_dir);
-
-	new->s_root = d_alloc_root(inode);
-	if (!new->s_root) {
-		fs_err(sdp, "can't get root dentry\n");
-		error = -ENOMEM;
-		iput(inode);
-	} else
-		new->s_root->d_op = &gfs2_dops;
-
-	return error;
-}
-
-static int set_bdev_super(struct super_block *s, void *data)
-{
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	return 0;
-}
-
-static int test_bdev_super(struct super_block *s, void *data)
-{
-	return s->s_bdev == data;
-}
-
-static struct super_block* get_gfs2_sb(const char *dev_name)
-{
-	struct kstat stat;
+	struct super_block *sb;
 	struct nameidata nd;
-	struct super_block *sb = NULL, *s;
 	int error;
 
 	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
 	if (error) {
-		printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
-		       dev_name);
-		goto out;
-	}
-	error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
-
-	list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
-		if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
-		    (S_ISDIR(stat.mode) &&
-		     s == nd.path.dentry->d_inode->i_sb)) {
-			sb = s;
-			goto free_nd;
-		}
+		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
+		       dev_name, error);
+		return NULL;
 	}
-
-	printk(KERN_WARNING "GFS2: Unrecognized block device or "
-	       "mount point %s\n", dev_name);
-
-free_nd:
+	sb = nd.path.dentry->d_inode->i_sb;
+	if (sb && (sb->s_type == &gfs2_fs_type))
+		atomic_inc(&sb->s_active);
+	else
+		sb = NULL;
 	path_put(&nd.path);
-out:
 	return sb;
 }
 
 static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	int error = 0;
-	struct super_block *sb = NULL, *new;
+	struct super_block *sb = NULL;
 	struct gfs2_sbd *sdp;
 
 	sb = get_gfs2_sb(dev_name);
 	if (!sb) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		error = -ENOENT;
-		goto error;
+		return -ENOENT;
 	}
 	sdp = sb->s_fs_info;
-	if (sdp->sd_vfs_meta) {
-		printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
-		error = -EBUSY;
-		goto error;
-	}
-	down(&sb->s_bdev->bd_mount_sem);
-	new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
-	up(&sb->s_bdev->bd_mount_sem);
-	if (IS_ERR(new)) {
-		error = PTR_ERR(new);
-		goto error;
-	}
-	new->s_flags = flags;
-	strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
-	sb_set_blocksize(new, sb->s_blocksize);
-	error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&new->s_umount);
-		deactivate_super(new);
-		goto error;
-	}
-
-	new->s_flags |= MS_ACTIVE;
-
-	/* Grab a reference to the gfs2 mount point */
-	atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
-	return simple_set_mnt(mnt, new);
-error:
-	return error;
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sdp->sd_master_dir);
+	return 0;
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
-{
-	if (sb->s_fs_info) {
-		gfs2_delete_debugfs_file(sb->s_fs_info);
-		gfs2_meta_syncfs(sb->s_fs_info);
-	}
-	kill_block_super(sb);
-}
-
-static void gfs2_kill_sb_meta(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	generic_shutdown_super(sb);
-	sdp->sd_vfs_meta = NULL;
-	atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+	gfs2_meta_syncfs(sdp);
+	dput(sdp->sd_root_dir);
+	dput(sdp->sd_master_dir);
+	sdp->sd_root_dir = NULL;
+	sdp->sd_master_dir = NULL;
+	shrink_dcache_sb(sb);
+	kill_block_super(sb);
+	gfs2_delete_debugfs_file(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1259,6 @@ struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
 	.get_sb = gfs2_get_sb_meta,
-	.kill_sb = gfs2_kill_sb_meta,
 	.owner = THIS_MODULE,
 };
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a35..8f332d26b5d 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -62,6 +62,39 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 	return 0;
 }
 
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder t_gh;
+	int error;
+
+	gfs2_quota_sync(sdp);
+	gfs2_statfs_sync(sdp);
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &t_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	gfs2_meta_syncfs(sdp);
+	gfs2_log_shutdown(sdp);
+
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	if (t_gh.gh_gl)
+		gfs2_glock_dq_uninit(&t_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
+}
+
 /**
  * gfs2_put_super - Unmount the filesystem
  * @sb: The VFS superblock
@@ -73,12 +106,6 @@ static void gfs2_put_super(struct super_block *sb)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
-	if (!sdp)
-		return;
-
-	if (!strncmp(sb->s_type->name, "gfs2meta", 8))
-		return; /* Nothing to do */
-
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +128,6 @@ static void gfs2_put_super(struct super_block *sb)
 
 	/*  Release stuff  */
 
-	iput(sdp->sd_master_dir);
 	iput(sdp->sd_jindex);
 	iput(sdp->sd_inum_inode);
 	iput(sdp->sd_statfs_inode);
@@ -152,6 +178,7 @@ static void gfs2_write_super(struct super_block *sb)
  *
  * Flushes the log to disk.
  */
+
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
 	sb->s_dirt = 0;
@@ -295,6 +322,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  * inode's blocks, or alternatively pass the baton on to another
  * node for later deallocation.
  */
+
 static void gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +361,16 @@ static void gfs2_clear_inode(struct inode *inode)
 	}
 }
 
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+	do {
+		if (d1 == d2)
+			return 1;
+		d1 = d1->d_parent;
+	} while (!IS_ROOT(d1));
+	return 0;
+}
+
 /**
  * gfs2_show_options - Show mount options for /proc/mounts
  * @s: seq_file structure
@@ -346,6 +384,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
 
+	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+		seq_printf(s, ",meta");
 	if (args->ar_lockproto[0])
 		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
 	if (args->ar_locktable[0])
@@ -414,6 +454,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
  * conversion on the iopen lock, but we can change that later. This
  * is safe, just less efficient.
  */
+
 static void gfs2_delete_inode(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +519,6 @@ out:
 	clear_inode(inode);
 }
 
-
-
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
 {
 	struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc..c3ba3d9d0aa 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
 #include "trans.h"
 #include "util.h"
 
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
-/**
- * gfs2_tune_init - Fill a gfs2_tune structure with default values
- * @gt: tune
- *
- */
-
-void gfs2_tune_init(struct gfs2_tune *gt)
-{
-	spin_lock_init(&gt->gt_spin);
-
-	gt->gt_demote_secs = 300;
-	gt->gt_incore_log_blocks = 1024;
-	gt->gt_log_flush_secs = 60;
-	gt->gt_recoverd_secs = 60;
-	gt->gt_logd_secs = 1;
-	gt->gt_quotad_secs = 5;
-	gt->gt_quota_simul_sync = 64;
-	gt->gt_quota_warn_period = 10;
-	gt->gt_quota_scale_num = 1;
-	gt->gt_quota_scale_den = 1;
-	gt->gt_quota_cache_secs = 300;
-	gt->gt_quota_quantum = 60;
-	gt->gt_atime_quantum = 3600;
-	gt->gt_new_files_jdata = 0;
-	gt->gt_max_readahead = 1 << 18;
-	gt->gt_stall_secs = 600;
-	gt->gt_complain_secs = 10;
-	gt->gt_statfs_quantum = 30;
-	gt->gt_statfs_slow = 0;
-}
-
-/**
- * gfs2_check_sb - Check superblock
- * @sdp: the filesystem
- * @sb: The superblock
- * @silent: Don't print a message if the check fails
- *
- * Checks the version code of the FS is one that we understand how to
- * read and that the sizes of the various on-disk structures have not
- * changed.
- */
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
-{
-	unsigned int x;
-
-	if (sb->sb_magic != GFS2_MAGIC ||
-	    sb->sb_type != GFS2_METATYPE_SB) {
-		if (!silent)
-			printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
-		return -EINVAL;
-	}
-
-	/*  If format numbers match exactly, we're done.  */
-
-	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
-	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
-		return 0;
-
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
-
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-
-static void end_bio_io_page(struct bio *bio, int error)
-{
-	struct page *page = bio->bi_private;
-
-	if (!error)
-		SetPageUptodate(page);
-	else
-		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
-	unlock_page(page);
-}
-
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
-{
-	const struct gfs2_sb *str = buf;
-
-	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
-	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
-	sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
-	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
-	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
-	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
-	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
-	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
-	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
-	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
-	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
-
-	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
-	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-}
-
-/**
- * gfs2_read_super - Read the gfs2 super block from disk
- * @sdp: The GFS2 super block
- * @sector: The location of the super block
- * @error: The error code to return
- *
- * This uses the bio functions to read the super block from disk
- * because we want to be 100% sure that we never read cached data.
- * A super block is read twice only during each GFS2 mount and is
- * never written to by the filesystem. The first time its read no
- * locks are held, and the only details which are looked at are those
- * relating to the locking protocol. Once locking is up and working,
- * the sb is read again under the lock to establish the location of
- * the master directory (contains pointers to journals etc) and the
- * root directory.
- *
- * Returns: 0 on success or error
- */
-
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
-{
-	struct super_block *sb = sdp->sd_vfs;
-	struct gfs2_sb *p;
-	struct page *page;
-	struct bio *bio;
-
-	page = alloc_page(GFP_NOFS);
-	if (unlikely(!page))
-		return -ENOBUFS;
-
-	ClearPageUptodate(page);
-	ClearPageDirty(page);
-	lock_page(page);
-
-	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
-	bio->bi_sector = sector * (sb->s_blocksize >> 9);
-	bio->bi_bdev = sb->s_bdev;
-	bio_add_page(bio, page, PAGE_SIZE, 0);
-
-	bio->bi_end_io = end_bio_io_page;
-	bio->bi_private = page;
-	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
-	wait_on_page_locked(page);
-	bio_put(bio);
-	if (!PageUptodate(page)) {
-		__free_page(page);
-		return -EIO;
-	}
-	p = kmap(page);
-	gfs2_sb_in(&sdp->sd_sb, p);
-	kunmap(page);
-	__free_page(page);
-	return 0;
-}
-
-/**
- * gfs2_read_sb - Read super block
- * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
- * @silent: Don't print message if mount fails
- *
- */
-
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
-{
-	u32 hash_blocks, ind_blocks, leaf_blocks;
-	u32 tmp_blocks;
-	unsigned int x;
-	int error;
-
-	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
-	if (error) {
-		if (!silent)
-			fs_err(sdp, "can't read superblock\n");
-		return error;
-	}
-
-	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
-	if (error)
-		return error;
-
-	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
-			       GFS2_BASIC_BLOCK_SHIFT;
-	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
-	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
-			  sizeof(struct gfs2_dinode)) / sizeof(u64);
-	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
-			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
-	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
-	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
-	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
-	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
-	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
-				sizeof(struct gfs2_meta_header)) /
-			        sizeof(struct gfs2_quota_change);
-
-	/* Compute maximum reservation required to add a entry to a directory */
-
-	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
-			     sdp->sd_jbsize);
-
-	ind_blocks = 0;
-	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
-		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
-		ind_blocks += tmp_blocks;
-	}
-
-	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
-
-	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
-
-	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
-				sizeof(struct gfs2_dinode);
-	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
-	for (x = 2;; x++) {
-		u64 space, d;
-		u32 m;
-
-		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
-		d = space;
-		m = do_div(d, sdp->sd_inptrs);
-
-		if (d != sdp->sd_heightsize[x - 1] || m)
-			break;
-		sdp->sd_heightsize[x] = space;
-	}
-	sdp->sd_max_height = x;
-	sdp->sd_heightsize[x] = ~0;
-	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
-
-	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
-				 sizeof(struct gfs2_dinode);
-	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
-	for (x = 2;; x++) {
-		u64 space, d;
-		u32 m;
-
-		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
-		d = space;
-		m = do_div(d, sdp->sd_inptrs);
-
-		if (d != sdp->sd_jheightsize[x - 1] || m)
-			break;
-		sdp->sd_jheightsize[x] = space;
-	}
-	sdp->sd_max_jheight = x;
-	sdp->sd_jheightsize[x] = ~0;
-	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
-	return 0;
-}
-
 /**
  * gfs2_jindex_hold - Grab a lock on the jindex
  * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
 	return error;
 }
 
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-	struct gfs2_holder t_gh;
-	int error;
-
-	gfs2_quota_sync(sdp);
-	gfs2_statfs_sync(sdp);
-
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-				   &t_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return error;
-
-	gfs2_meta_syncfs(sdp);
-	gfs2_log_shutdown(sdp);
-
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
-	if (t_gh.gh_gl)
-		gfs2_glock_dq_uninit(&t_gh);
-
-	gfs2_quota_cleanup(sdp);
-
-	return error;
-}
-
 static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
 	const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f..50a4c9b1215 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
 
 #include "incore.h"
 
-void gfs2_tune_init(struct gfs2_tune *gt);
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 			      struct gfs2_inode **ipp);
 
 int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 
 int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
-- 
cgit v1.2.3


From 72dbf4790fc6736f9cb54424245114acf0b0038c Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 12 Aug 2008 13:39:29 -0500
Subject: GFS2: rm on multiple nodes causes panic

This patch fixes a problem whereby simultaneous unlink, rmdir,
rename and link operations (e.g. rm -fR *) from multiple nodes
on the same GFS2 file system can cause kernel panics, hangs,
and/or memory corruption.  It also gets rid of all the non-rgrp
calls to gfs2_glock_nq_m.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 56 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a77..35f6f032a02 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
 
-	error = gfs2_glock_nq_m(2, ghs);
+	error = gfs2_glock_nq(ghs); /* parent */
 	if (error)
-		goto out;
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
 
 	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -245,8 +249,10 @@ out_alloc:
 	if (alloc_required)
 		gfs2_alloc_put(dip);
 out_gunlock:
-	gfs2_glock_dq_m(2, ghs);
-out:
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
 	if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
 	if (error)
-		goto out_rgrp;
+		goto out_gunlock;
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 out_end_trans:
 	gfs2_trans_end(sdp);
+out_gunlock:
 	gfs2_glock_dq(ghs + 2);
 out_rgrp:
 	gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	struct gfs2_holder ri_gh;
 	int error;
 
-
 	error = gfs2_rindex_hold(sdp, &ri_gh);
 	if (error)
 		return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
 	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
 
-	error = gfs2_glock_nq_m(3, ghs);
+	error = gfs2_glock_nq(ghs); /* parent */
 	if (error)
-		goto out;
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
 
 	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
 	if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	gfs2_trans_end(sdp);
 
 out_gunlock:
-	gfs2_glock_dq_m(3, ghs);
-out:
-	gfs2_holder_uninit(ghs);
-	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
 	gfs2_holder_uninit(ghs + 2);
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
 	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
@@ -639,9 +657,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
 	}
 
-	error = gfs2_glock_nq_m(num_gh, ghs);
-	if (error)
-		goto out_uninit;
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
 
 	/* Check out the old directory */
 
@@ -804,10 +824,10 @@ out_alloc:
 	if (alloc_required)
 		gfs2_alloc_put(ndip);
 out_gunlock:
-	gfs2_glock_dq_m(num_gh, ghs);
-out_uninit:
-	for (x = 0; x < num_gh; x++)
+	while (x--) {
+		gfs2_glock_dq(ghs + x);
 		gfs2_holder_uninit(ghs + x);
+	}
 out_gunlock_r:
 	if (dir_rename)
 		gfs2_glock_dq_uninit(&r_gh);
-- 
cgit v1.2.3


From db0badc58e948b810c7a75cfcc48845e2949ee37 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 18 Aug 2008 13:40:18 +0200
Subject: udf: Fix lock inversion between iprune_mutex and alloc_mutex (v2)

A memory allocation inside alloc_mutex must not recurse back into the
filesystem itself because that leads to lock inversion between iprune_mutex and
alloc_mutex (and thus to deadlocks - see traces below). alloc_mutex is actually
needed only to update allocation statistics in the superblock so we can drop it
before we start allocating memory for the inode.

tar           D ffff81015b9c8c90     0  6614   6612
 ffff8100d5a21a20 0000000000000086 0000000000000000 00000000ffff0000
 ffff81015b9c8c90 ffff81015b8f0cd0 ffff81015b9c8ee0 0000000000000000
 0000000000000003 0000000000000000 0000000000000000 0000000000000000
Call Trace:
 [<ffffffff803c1d8a>] __mutex_lock_slowpath+0x64/0x9b
 [<ffffffff803c1bef>] mutex_lock+0xa/0xb
 [<ffffffff8027f8c2>] shrink_icache_memory+0x38/0x200
 [<ffffffff80257742>] shrink_slab+0xe3/0x15b
 [<ffffffff802579db>] try_to_free_pages+0x221/0x30d
 [<ffffffff8025657e>] isolate_pages_global+0x0/0x31
 [<ffffffff8025324b>] __alloc_pages_internal+0x252/0x3ab
 [<ffffffff8026b08b>] cache_alloc_refill+0x22e/0x47b
 [<ffffffff8026ae37>] kmem_cache_alloc+0x3b/0x61
 [<ffffffff8026b15b>] cache_alloc_refill+0x2fe/0x47b
 [<ffffffff8026b34e>] __kmalloc+0x76/0x9c
 [<ffffffffa00751f2>] :udf:udf_new_inode+0x202/0x2e2
 [<ffffffffa007ae5e>] :udf:udf_create+0x2f/0x16d
 [<ffffffffa0078f27>] :udf:udf_lookup+0xa6/0xad
...
kswapd0       D ffff81015b9d9270     0   125      2
 ffff81015b903c28 0000000000000046 ffffffff8028cbb0 00000000fffffffb
 ffff81015b9d9270 ffff81015b8f0cd0 ffff81015b9d94c0 000000000271b490
 ffffe2000271b458 ffffe2000271b420 ffffe20002728dc8 ffffe20002728d90
Call Trace:
 [<ffffffff8028cbb0>] __set_page_dirty+0xeb/0xf5
 [<ffffffff8025403a>] get_dirty_limits+0x1d/0x22f
 [<ffffffff803c1d8a>] __mutex_lock_slowpath+0x64/0x9b
 [<ffffffff803c1bef>] mutex_lock+0xa/0xb
 [<ffffffffa0073f58>] :udf:udf_bitmap_free_blocks+0x47/0x1eb
 [<ffffffffa007df31>] :udf:udf_discard_prealloc+0xc6/0x172
 [<ffffffffa007875a>] :udf:udf_clear_inode+0x1e/0x48
 [<ffffffff8027f121>] clear_inode+0x6d/0xc4
 [<ffffffff8027f7f2>] dispose_list+0x56/0xee
 [<ffffffff8027fa5a>] shrink_icache_memory+0x1d0/0x200
 [<ffffffff80257742>] shrink_slab+0xe3/0x15b
 [<ffffffff80257e93>] kswapd+0x346/0x447
...

Reported-by: Tibor Tajti <tibor.tajti@gmail.com>
Reviewed-by: Ingo Oeser <ioe-lkml@rameria.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ialloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3..c4943c8988c 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -111,6 +111,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
 		mark_buffer_dirty(sbi->s_lvid_bh);
 	}
+	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	if (dir->i_mode & S_ISGID) {
@@ -145,7 +146,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	if (!iinfo->i_ext.i_data) {
 		iput(inode);
 		*err = -ENOMEM;
-		mutex_unlock(&sbi->s_alloc_mutex);
 		return NULL;
 	}
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
@@ -158,7 +158,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		iinfo->i_crtime = current_fs_time(inode->i_sb);
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
-	mutex_unlock(&sbi->s_alloc_mutex);
 
 	if (DQUOT_ALLOC_INODE(inode)) {
 		DQUOT_DROP(inode);
-- 
cgit v1.2.3


From 97e1cfb08616987878f91a46cefdd7fc5fa3dba1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 18 Aug 2008 13:44:48 +0200
Subject: udf: Fix error paths in udf_new_inode()

I case we failed to allocate memory for inode when creating it, we did not
properly free block already allocated for this inode. Move memory allocation
before the block allocation which fixes this issue (thanks for the idea go to
Ingo Oeser <ioe-lkml@rameria.de>). Also remove a few superfluous
initializations already done in udf_alloc_inode().

Reviewed-by: Ingo Oeser <ioe-lkml@rameria.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ialloc.c | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c4943c8988c..a4f2b3ce45b 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	*err = -ENOSPC;
 
 	iinfo = UDF_I(inode);
-	iinfo->i_unique = 0;
-	iinfo->i_lenExtents = 0;
-	iinfo->i_next_alloc_block = 0;
-	iinfo->i_next_alloc_goal = 0;
-	iinfo->i_strat4096 = 0;
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
+		iinfo->i_efe = 1;
+		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
+			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct extendedFileEntry),
+					    GFP_KERNEL);
+	} else {
+		iinfo->i_efe = 0;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct fileEntry),
+					    GFP_KERNEL);
+	}
+	if (!iinfo->i_ext.i_data) {
+		iput(inode);
+		*err = -ENOMEM;
+		return NULL;
+	}
 
 	block = udf_new_block(dir->i_sb, NULL,
 			      dinfo->i_location.partitionReferenceNum,
@@ -130,24 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
 	iinfo->i_use = 0;
-	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
-		iinfo->i_efe = 1;
-		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
-			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
-		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-					    sizeof(struct extendedFileEntry),
-					    GFP_KERNEL);
-	} else {
-		iinfo->i_efe = 0;
-		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-					    sizeof(struct fileEntry),
-					    GFP_KERNEL);
-	}
-	if (!iinfo->i_ext.i_data) {
-		iput(inode);
-		*err = -ENOMEM;
-		return NULL;
-	}
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
 	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
-- 
cgit v1.2.3


From 1f7c14c62ce63805f9574664a6c6de3633d4a354 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Thu, 9 Oct 2008 12:50:59 -0400
Subject: percpu counter: clean up percpu_counter_sum_and_set()

percpu_counter_sum_and_set() and percpu_counter_sum() is the same except
the former updates the global counter after accounting.  Since we are
taking the fbc->lock to calculate the precise value of the counter in
percpu_counter_sum() anyway, it should simply set fbc->count too, as the
percpu_counter_sum_and_set() does.

This patch merges these two interfaces into one.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e9fa960ba6d..00a94d5866c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1624,7 +1624,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 #ifdef CONFIG_SMP
 	if (free_blocks - root_blocks < FBC_BATCH)
 		free_blocks =
-			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+			percpu_counter_sum(&sbi->s_freeblocks_counter);
 #endif
 	if (free_blocks <= root_blocks)
 		/* we don't have free space */
-- 
cgit v1.2.3


From 4776004f54e4190e104caf620fd0fa5909412236 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Sep 2008 23:00:52 -0400
Subject: ext4: Add printk priority levels to clean up checkpatch warnings

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 29 +++++++++++++++--------------
 fs/ext4/dir.c     |  3 ++-
 fs/ext4/extents.c |  9 +++++----
 fs/ext4/ialloc.c  | 16 +++++++++-------
 fs/ext4/mballoc.c |  9 +++++----
 fs/ext4/namei.c   | 34 +++++++++++++++++++---------------
 fs/ext4/super.c   | 25 ++++++++++++++++---------
 7 files changed, 71 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 00a94d5866c..7aac2025ba9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -379,26 +379,28 @@ restart:
 	bad = 0;
 	prev = NULL;
 
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
+	printk(KERN_DEBUG "Block Allocation Reservation "
+	       "Windows Map (%s):\n", fn);
 	while (n) {
 		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
 		if (verbose)
-			printk("reservation window 0x%p "
+			printk(KERN_DEBUG "reservation window 0x%p "
 			       "start:  %llu, end:  %llu\n",
 			       rsv, rsv->rsv_start, rsv->rsv_end);
 		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
+			printk(KERN_DEBUG "Bad reservation %p (start >= end)\n",
 			       rsv);
 			bad = 1;
 		}
 		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
+			printk(KERN_DEBUG "Bad reservation %p "
+			       "(prev->end >= start)\n", rsv);
 			bad = 1;
 		}
 		if (bad) {
 			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
+				printk(KERN_DEBUG "Restarting reservation "
+				       "walk in verbose mode\n");
 				verbose = 1;
 				goto restart;
 			}
@@ -406,7 +408,7 @@ restart:
 		n = rb_next(n);
 		prev = rsv;
 	}
-	printk("Window map complete.\n");
+	printk(KERN_DEBUG "Window map complete.\n");
 	BUG_ON(bad);
 }
 #define rsv_window_dump(root, verbose) \
@@ -1702,7 +1704,7 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 	sb = inode->i_sb;
 	if (!sb) {
 		*errp = -ENODEV;
-		printk("ext4_new_block: nonexistent device");
+		printk(KERN_ERR "ext4_new_block: nonexistent superblock");
 		return 0;
 	}
 
@@ -1884,8 +1886,8 @@ allocated:
 		for (i = 0; i < num; i++) {
 			if (ext4_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk("%s: block was unexpectedly set in "
-					"b_committed_data\n", __func__);
+				printk(KERN_ERR "%s: block was unexpectedly "
+				       "set in b_committed_data\n", __func__);
 			}
 		}
 	}
@@ -2093,10 +2095,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_blocks: stored = %llu"
-		", computed = %llu, %llu\n",
-		ext4_free_blocks_count(es),
-		desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
+		", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+	       desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ec8e33b4521..ac873dbf0dd 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -400,7 +400,8 @@ static int call_filldir(struct file * filp, void * dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk("call_filldir: called with null fname?!?\n");
+		printk(KERN_ERR "ext4: call_filldir: called with "
+		       "null fname?!?\n");
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b24d3c53f20..a8db7fdf9cb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -440,9 +440,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 		  if (k != 0 &&
 		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
-				printk("k=%d, ix=0x%p, first=0x%p\n", k,
-					ix, EXT_FIRST_INDEX(eh));
-				printk("%u <= %u\n",
+				printk(KERN_DEBUG "k=%d, ix=0x%p, "
+				       "first=0x%p\n", k,
+				       ix, EXT_FIRST_INDEX(eh));
+				printk(KERN_DEBUG "%u <= %u\n",
 				       le32_to_cpu(ix->ei_block),
 				       le32_to_cpu(ix[-1].ei_block));
 			}
@@ -2142,7 +2143,7 @@ void ext4_ext_init(struct super_block *sb)
 	 */
 
 	if (test_opt(sb, EXTENTS)) {
-		printk("EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
 #endif
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f344834bbf5..45c66a03f18 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -170,17 +170,18 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
-		printk ("ext4_free_inode: inode has count=%d\n",
-					atomic_read(&inode->i_count));
+		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
+		       atomic_read(&inode->i_count));
 		return;
 	}
 	if (inode->i_nlink) {
-		printk ("ext4_free_inode: inode has nlink=%d\n",
-			inode->i_nlink);
+		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
+		       inode->i_nlink);
 		return;
 	}
 	if (!sb) {
-		printk("ext4_free_inode: inode on nonexistent device\n");
+		printk(KERN_ERR "ext4_free_inode: inode on "
+		       "nonexistent device\n");
 		return;
 	}
 	sbi = EXT4_SB(sb);
@@ -989,8 +990,9 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_inodes: "
+	       "stored = %u, computed = %lu, %lu\n",
+	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
 	return desc_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e0e3a5eb1dd..0db2ccfa0da 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 		b2 = (unsigned char *) bitmap;
 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 			if (b1[i] != b2[i]) {
-				printk("corruption in group %lu at byte %u(%u):"
-				       " %x in copy != %x on disk/prealloc\n",
-					e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				printk(KERN_ERR "corruption in group %lu "
+				       "at byte %u(%u): %x in copy != %x "
+				       "on disk/prealloc\n",
+				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
 				BUG();
 			}
 		}
@@ -2560,7 +2561,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
-	printk("EXT4-fs: mballoc enabled\n");
+	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
 }
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c..978b57f8630 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -254,12 +254,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
  * Debug
  */
 #ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
+static void dx_show_index(char * label, struct dx_entry *entries)
 {
 	int i, n = dx_get_count (entries);
-	printk("%s index ", label);
+	printk(KERN_DEBUG "%s index ", label);
 	for (i = 0; i < n; i++) {
-		printk("%x->%lu ", i? dx_get_hash(entries + i) :
+		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
 				0, (unsigned long)dx_get_block(entries + i));
 	}
 	printk("\n");
@@ -328,8 +328,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse (bh);
 	}
 	if (bcount)
-		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
-			names, space/bcount,(space/bcount)*100/blocksize);
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		       levels ? "" : "   ", names, space/bcount,
+		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
 }
 #endif /* DX_DEBUG */
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-		       start_minor_hash));
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 			break;
 	}
 	dx_release(frames);
-	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
-		       count, *next_hash));
+	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+		       "next hash: %x\n", count, *next_hash));
 	return count;
 errout:
 	dx_release(frames);
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 		 */
 		if (bh || (err != ERR_BAD_DX_DIR))
 			return bh;
-		dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+			       "falling back\n"));
 	}
 	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
 	start = EXT4_I(dir)->i_dir_start_lookup;
@@ -1025,7 +1027,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 
 	*err = -ENOENT;
 errout:
-	dxtrace(printk("%s not found\n", name));
+	dxtrace(printk(KERN_DEBUG "%s not found\n", name));
 	dx_release (frames);
 	return NULL;
 }
@@ -1377,7 +1379,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk("Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index\n"));
 	retval = ext4_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext4_std_error(dir->i_sb, retval);
@@ -1527,7 +1529,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1561,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		if (levels) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
@@ -1588,7 +1591,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 				goto journal_error;
 			brelse (bh2);
 		} else {
-			dxtrace(printk("Creating second level index...\n"));
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 566344b926b..ee869546495 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -981,7 +981,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	/*todo: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT4-fs: Invalid sb specification: %s\n",
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
 		       (char *) *data);
 		return 1;
 	}
@@ -1082,7 +1082,8 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT4 (no)user_xattr options not supported\n");
+			printk(KERN_ERR "EXT4 (no)user_xattr options "
+			       "not supported\n");
 			break;
 #endif
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
@@ -1095,7 +1096,8 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT4 (no)acl options not supported\n");
+			printk(KERN_ERR "EXT4 (no)acl options "
+			       "not supported\n");
 			break;
 #endif
 		case Opt_reservation:
@@ -1189,8 +1191,8 @@ set_qf_name:
 			     sb_any_quota_suspended(sb)) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
-					"EXT4-fs: Cannot change journaled "
-					"quota options when quota turned on.\n");
+				       "EXT4-fs: Cannot change journaled "
+				       "quota options when quota turned on.\n");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
@@ -1473,14 +1475,14 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
 	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
 		char b[BDEVNAME_SIZE];
 
-		printk("external journal on %s\n",
-			bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
+		printk(KERN_INFO "EXT4 FS on %s, external journal on %s\n",
+		       sb->s_id, bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
 	} else {
-		printk("internal journal\n");
+		printk(KERN_INFO "EXT4 FS on %s, internal journal\n",
+		       sb->s_id);
 	}
 	return res;
 }
@@ -2715,6 +2717,11 @@ static int ext4_load_journal(struct super_block *sb,
 			return -EINVAL;
 	}
 
+	if (journal->j_flags & JBD2_BARRIER)
+		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+	else
+		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = jbd2_journal_update_format(journal);
 		if (err)  {
-- 
cgit v1.2.3


From e5f8eab8851dff162e7ade46f084cb8575dc45f7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Sep 2008 22:25:04 -0400
Subject: ext4: Fix long long checkpatch warnings

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee869546495..c10aaf7d83c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1716,9 +1716,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		DQUOT_INIT(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %Ld bytes\n",
+				"%s: truncating inode %lu to %lld bytes\n",
 				__func__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
@@ -2554,7 +2554,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 		return NULL;
 	}
 
-	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
 		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -3439,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
+		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
 			" cancelled because transaction is not started.\n",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
-- 
cgit v1.2.3


From af5bc92dded4d98dfeabc8b5b9812571345b263d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Sep 2008 22:25:24 -0400
Subject: ext4: Fix whitespace checkpatch warnings/errors

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/acl.h     |   6 +-
 fs/ext4/balloc.c  |  70 ++++++-------
 fs/ext4/bitmap.c  |   6 +-
 fs/ext4/dir.c     |  52 +++++-----
 fs/ext4/ext4.h    |  68 ++++++------
 fs/ext4/ext4_sb.h |   8 +-
 fs/ext4/extents.c |   6 +-
 fs/ext4/file.c    |   2 +-
 fs/ext4/fsync.c   |   2 +-
 fs/ext4/hash.c    |   8 +-
 fs/ext4/ialloc.c  |  50 ++++-----
 fs/ext4/inode.c   |  96 ++++++++---------
 fs/ext4/ioctl.c   |   4 +-
 fs/ext4/namei.c   | 302 +++++++++++++++++++++++++++---------------------------
 fs/ext4/resize.c  |   6 +-
 fs/ext4/super.c   |   6 +-
 fs/ext4/symlink.c |   4 +-
 fs/ext4/xattr.h   |   4 +-
 18 files changed, 350 insertions(+), 350 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d..4c9948f69f8 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -58,9 +58,9 @@ static inline int ext4_acl_count(size_t size)
 #define EXT4_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext4_permission (struct inode *, int);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
 #include <linux/sched.h>
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 7aac2025ba9..58005c01abb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -132,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -200,20 +200,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * @bh:			pointer to the buffer head to store the block
  *			group descriptor
  */
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
-					     struct buffer_head ** bh)
+					     struct buffer_head **bh)
 {
 	unsigned long group_desc;
 	unsigned long offset;
-	struct ext4_group_desc * desc;
+	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "block_group >= groups_count - "
-			    "block_group = %lu, groups_count = %lu",
-			    block_group, sbi->s_groups_count);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "block_group >= groups_count - "
+			   "block_group = %lu, groups_count = %lu",
+			   block_group, sbi->s_groups_count);
 
 		return NULL;
 	}
@@ -222,10 +222,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "Group descriptor not loaded - "
-			    "block_group = %lu, group_desc = %lu, desc = %lu",
-			     block_group, group_desc, offset);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "Group descriptor not loaded - "
+			   "block_group = %lu, group_desc = %lu, desc = %lu",
+			   block_group, group_desc, offset);
 		return NULL;
 	}
 
@@ -302,8 +302,8 @@ err_out:
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-	struct ext4_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -506,8 +506,8 @@ void ext4_rsv_window_add(struct super_block *sb,
 	struct rb_node *node = &rsv->rsv_node;
 	ext4_fsblk_t start = rsv->rsv_start;
 
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct ext4_reserve_window_node *this;
 
 	while (*p)
@@ -661,8 +661,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
-	struct ext4_group_desc * desc;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *desc;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
 	ext4_grpblk_t group_freed;
@@ -673,13 +673,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > ext4_blocks_count(es)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks not in datazone - "
-			    "block = %llu, count = %lu", block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
 
 do_more:
 	overflow = 0;
@@ -696,7 +696,7 @@ do_more:
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
-	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
 	if (!desc)
 		goto error_return;
 
@@ -705,10 +705,10 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks in system zones - "
-			    "Block = %llu, count = %lu",
-			    block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
 		goto error_return;
 	}
 
@@ -850,7 +850,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count,
 			int metadata)
 {
-	struct super_block * sb;
+	struct super_block *sb;
 	unsigned long dquot_freed_blocks;
 
 	/* this isn't the right place to decide whether block is metadata
@@ -1019,7 +1019,7 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
 	if (ext4_set_bit_atomic(lock, block, bh->b_data))
 		return 0;
 	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
+	if (jh->b_committed_data && ext4_test_bit(block, jh->b_committed_data)) {
 		ext4_clear_bit_atomic(lock, block, bh->b_data);
 		ret = 0;
 	} else {
@@ -1170,7 +1170,7 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext4_reserve_window_node *search_head,
 				struct ext4_reserve_window_node *my_rsv,
-				struct super_block * sb,
+				struct super_block *sb,
 				ext4_fsblk_t start_block,
 				ext4_fsblk_t last_block)
 {
@@ -1204,7 +1204,7 @@ static int find_next_reservable_window(
 
 		prev = rsv;
 		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
+		rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
 
 		/*
 		 * Reached the last reservation, we can just append to the
@@ -1342,7 +1342,7 @@ static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
 			size = size * 2;
 			if (size > EXT4_MAX_RESERVE_BLOCKS)
 				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
+			my_rsv->rsv_goal_size = size;
 		}
 	}
 
@@ -1491,7 +1491,7 @@ static ext4_grpblk_t
 ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			ext4_group_t group, struct buffer_head *bitmap_bh,
 			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node * my_rsv,
+			struct ext4_reserve_window_node *my_rsv,
 			unsigned long *count, int *errp)
 {
 	ext4_fsblk_t group_first_block, group_last_block;
@@ -1519,7 +1519,7 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 * or the file is not a regular file
 	 * or last attempt to allocate a block with reservation turned on failed
 	 */
-	if (my_rsv == NULL ) {
+	if (my_rsv == NULL) {
 		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
 						grp_goal, count, NULL);
 		goto out;
@@ -2184,7 +2184,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
 			metagroup < first_meta_bg)
-		return ext4_bg_num_gdb_nometa(sb,group);
+		return ext4_bg_num_gdb_nometa(sb, group);
 
 	return ext4_bg_num_gdb_meta(sb,group);
 
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea675045..0a7a6663c19 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
 	unsigned int i;
 	unsigned long sum = 0;
 
 	if (!map)
-		return (0);
+		return 0;
 	for (i = 0; i < numchars; i++)
 		sum += nibblemap[map->b_data[i] & 0xf] +
 			nibblemap[(map->b_data[i] >> 4) & 0xf];
-	return (sum);
+	return sum;
 }
 
 #endif  /*  EXT4FS_DEBUG  */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ac873dbf0dd..d40da316921 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
 };
 
 static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
-static int ext4_release_dir (struct inode * inode,
-				struct file * filp);
+static int ext4_dx_readdir(struct file *filp,
+			   void *dirent, filldir_t filldir);
+static int ext4_release_dir(struct inode *inode,
+				struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
 	.llseek		= generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 }
 
 
-int ext4_check_dir_entry (const char * function, struct inode * dir,
-			  struct ext4_dir_entry_2 * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
+int ext4_check_dir_entry(const char *function, struct inode *dir,
+			 struct ext4_dir_entry_2 *de,
+			 struct buffer_head *bh,
+			 unsigned long offset)
 {
-	const char * error_msg = NULL;
+	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error (dir->i_sb, function,
+		ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
 			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext4_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	int error = 0;
 	unsigned long offset;
@@ -148,7 +148,7 @@ static int ext4_readdir(struct file * filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext4_error (sb, "ext4_readdir",
+			ext4_error(sb, "ext4_readdir",
 				"directory #%lu contains a hole at offset %lu",
 				inode->i_ino, (unsigned long)filp->f_pos);
 			/* corrupt size?  Maybe no more blocks to read */
@@ -187,14 +187,14 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
-						   bh, offset)) {
+			if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+						  bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
 				filp->f_pos = (filp->f_pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
+				brelse(bh);
 				ret = stored;
 				goto out;
 			}
@@ -218,12 +218,12 @@ revalidate:
 					break;
 				if (version != filp->f_version)
 					goto revalidate;
-				stored ++;
+				stored++;
 			}
 			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
-		brelse (bh);
+		brelse(bh);
 	}
 out:
 	return ret;
@@ -290,9 +290,9 @@ static void free_rb_tree_fname(struct rb_root *root)
 		parent = rb_parent(n);
 		fname = rb_entry(n, struct fname, rb_hash);
 		while (fname) {
-			struct fname * old = fname;
+			struct fname *old = fname;
 			fname = fname->next;
-			kfree (old);
+			kfree(old);
 		}
 		if (!parent)
 			root->rb_node = NULL;
@@ -331,7 +331,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 			     struct ext4_dir_entry_2 *dirent)
 {
 	struct rb_node **p, *parent = NULL;
-	struct fname * fname, *new_fn;
+	struct fname *fname, *new_fn;
 	struct dir_private_info *info;
 	int len;
 
@@ -388,13 +388,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
+static int call_filldir(struct file *filp, void *dirent,
 			filldir_t filldir, struct fname *fname)
 {
 	struct dir_private_info *info = filp->private_data;
 	loff_t	curr_pos;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct super_block * sb;
+	struct super_block *sb;
 	int error;
 
 	sb = inode->i_sb;
@@ -420,8 +420,8 @@ static int call_filldir(struct file * filp, void * dirent,
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	struct dir_private_info *info = filp->private_data;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -512,7 +512,7 @@ finished:
 	return 0;
 }
 
-static int ext4_release_dir (struct inode * inode, struct file * filp)
+static int ext4_release_dir(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 295003241d3..8c701318844 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
 #ifdef EXT4FS_DEBUG
 #define ext4_debug(f, a...)						\
 	do {								\
-		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
+		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
 			__FILE__, __LINE__, __func__);			\
-		printk (KERN_DEBUG f, ## a);				\
+		printk(KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
 #define ext4_debug(f, a...)	do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
 #else
 # define EXT4_BLOCK_SIZE(s)		(EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
 #endif
-#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof (__u32))
+#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof(__u32))
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
 #else
@@ -292,7 +292,7 @@ struct ext4_new_group_data {
 #define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
 #define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
 #define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD		_IOW('f', 8,struct ext4_new_group_input)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
 #define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
@@ -667,7 +667,7 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
-static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb)
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
@@ -725,11 +725,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
 	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
@@ -985,13 +985,13 @@ extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks);
-extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
-				 ext4_fsblk_t block, unsigned long count,
+extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count,
 				unsigned long *pdquot_freed_blocks);
-extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
-extern void ext4_check_blocks_bitmap (struct super_block *);
+extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
@@ -1009,20 +1009,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext4_sync_file (struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, struct dentry *, int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
 /* ialloc.c */
-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
-extern void ext4_free_inode (handle_t *, struct inode *);
-extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext4_count_free_inodes (struct super_block *);
-extern unsigned long ext4_count_dirs (struct super_block *);
-extern void ext4_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
+extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1056,18 +1056,18 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 				int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode (struct inode *, int);
-extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
-extern void ext4_delete_inode (struct inode *);
-extern int  ext4_sync_inode (handle_t *, struct inode *);
-extern void ext4_discard_reservation (struct inode *);
+extern void ext4_delete_inode(struct inode *);
+extern int  ext4_sync_inode(handle_t *, struct inode *);
+extern void ext4_discard_reservation(struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate (struct inode *);
+extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
@@ -1080,7 +1080,7 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
@@ -1099,14 +1099,14 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error (struct super_block *, const char *, const char *, ...)
+extern void ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_std_error (struct super_block *, const char *, int);
-extern void ext4_abort (struct super_block *, const char *, const char *, ...)
+extern void __ext4_std_error(struct super_block *, const char *, int);
+extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning (struct super_block *, const char *, const char *, ...)
+extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_update_dynamic_rev (struct super_block *sb);
+extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1179,7 +1179,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
 
 static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
-							ext4_group_t group)
+					    ext4_group_t group)
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d553..69810a25253 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext4_super_block * s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head ** s_group_desc;
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
 	unsigned long  s_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -67,8 +67,8 @@ struct ext4_sb_info {
 	struct ext4_reserve_window_node s_rsv_window_head;
 
 	/* Journaling */
-	struct inode * s_journal_inode;
-	struct journal_s * s_journal;
+	struct inode *s_journal_inode;
+	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
 	struct block_device *journal_bdev;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a8db7fdf9cb..797f0602a68 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -383,8 +383,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	ext_debug("\n");
 }
 #else
-#define ext4_ext_show_path(inode,path)
-#define ext4_ext_show_leaf(inode,path)
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -1476,7 +1476,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path,
 				struct ext4_extent *newext)
 {
-	struct ext4_extent_header * eh;
+	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db..11b289f42b7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,7 +31,7 @@
  * from ext4_file_open: open gets called at every open, but release
  * gets called only when /all/ the files are closed.
  */
-static int ext4_release_file (struct inode * inode, struct file * filp)
+static int ext4_release_file(struct inode *inode, struct file *filp)
 {
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad3..c37d1e86f51 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -43,7 +43,7 @@
  * inode to disk.
  */
 
-int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe39..556ca8eba3d 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 		sum += DELTA;
 		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
 		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-	} while(--n);
+	} while (--n);
 
 	buf[0] += b0;
 	buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash(const char *name, int len)
 {
 	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
 	while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i=0; i < len; i++) {
+	for (i = 0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
 		val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 
 	/* Check to see if the seed is all zero's */
 	if (hinfo->seed) {
-		for (i=0; i < 4; i++) {
+		for (i = 0; i < 4; i++) {
 			if (hinfo->seed[i])
 				break;
 		}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45c66a03f18..5e66a2feef0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -154,17 +154,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
  * though), and then we'd have two inodes sharing the
  * same inode number and space on the harddisk.
  */
-void ext4_free_inode (handle_t *handle, struct inode * inode)
+void ext4_free_inode(handle_t *handle, struct inode *inode)
 {
-	struct super_block * sb = inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t block_group;
 	unsigned long bit;
-	struct ext4_group_desc * gdp;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err;
 	ext4_group_t flex_group;
@@ -187,7 +187,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	sbi = EXT4_SB(sb);
 
 	ino = inode->i_ino;
-	ext4_debug ("freeing inode %lu\n", ino);
+	ext4_debug("freeing inode %lu\n", ino);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -201,12 +201,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	is_directory = S_ISDIR(inode->i_mode);
 
 	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
+	clear_inode(inode);
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error (sb, "ext4_free_inode",
-			    "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -223,10 +223,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 					bit, bitmap_bh->b_data))
-		ext4_error (sb, "ext4_free_inode",
-			      "bit already cleared for inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "bit already cleared for inode %lu", ino);
 	else {
-		gdp = ext4_get_group_desc (sb, block_group, &bh2);
+		gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
@@ -288,7 +288,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 	avefreei = freei / ngroups;
 
 	for (group = 0; group < ngroups; group++) {
-		desc = ext4_get_group_desc (sb, group, NULL);
+		desc = ext4_get_group_desc(sb, group, NULL);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
 		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -577,16 +577,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t group = 0;
 	unsigned long ino = 0;
-	struct inode * inode;
-	struct ext4_group_desc * gdp = NULL;
-	struct ext4_super_block * es;
+	struct inode *inode;
+	struct ext4_group_desc *gdp = NULL;
+	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
 	int ret2, err = 0;
@@ -614,7 +614,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	}
 
 	if (S_ISDIR(mode)) {
-		if (test_opt (sb, OLDALLOC))
+		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
 			ret2 = find_group_orlov(sb, dir, &group);
@@ -784,7 +784,7 @@ got:
 	}
 
 	inode->i_uid = current->fsuid;
-	if (test_opt (sb, GRPID))
+	if (test_opt(sb, GRPID))
 		inode->i_gid = dir->i_gid;
 	else if (dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
@@ -833,7 +833,7 @@ got:
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if(DQUOT_ALLOC_INODE(inode)) {
+	if (DQUOT_ALLOC_INODE(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -842,7 +842,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext4_init_security(handle,inode, dir);
+	err = ext4_init_security(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
 
@@ -960,7 +960,7 @@ error:
 	return ERR_PTR(err);
 }
 
-unsigned long ext4_count_free_inodes (struct super_block * sb)
+unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
@@ -975,7 +975,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 	bitmap_count = 0;
 	gdp = NULL;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -997,7 +997,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 #else
 	desc_count = 0;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1008,13 +1008,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 }
 
 /* Called at mount-time, super-block is locked */
-unsigned long ext4_count_dirs (struct super_block * sb)
+unsigned long ext4_count_dirs(struct super_block * sb)
 {
 	unsigned long count = 0;
 	ext4_group_t i;
 
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7e91913e325..89c92c0f829 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -190,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext4_delete_inode (struct inode * inode)
+void ext4_delete_inode(struct inode *inode)
 {
 	handle_t *handle;
 	int err;
@@ -330,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
 	int final = 0;
 
 	if (i_block < 0) {
-		ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
-	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
 		offsets[n++] = EXT4_IND_BLOCK;
 		offsets[n++] = i_block;
 		final = ptrs;
@@ -400,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 
 	*err = 0;
 	/* i_data is not going away, no lock needed */
-	add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
-		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
 			goto no_block;
@@ -443,7 +443,7 @@ no_block:
 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 	__le32 *p;
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
@@ -630,7 +630,7 @@ allocated:
 	*err = 0;
 	return ret;
 failed_out:
-	for (i = 0; i <index; i++)
+	for (i = 0; i < index; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 	return ret;
 }
@@ -703,7 +703,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
 		branch[n].key = cpu_to_le32(new_blocks[n]);
 		*branch[n].p = branch[n].key;
-		if ( n == indirect_blks) {
+		if (n == indirect_blks) {
 			current_block = new_blocks[n];
 			/*
 			 * End of chain, update the last new metablock of
@@ -730,7 +730,7 @@ failed:
 		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, branch[i].bh);
 	}
-	for (i = 0; i <indirect_blks; i++)
+	for (i = 0; i < indirect_blks; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
 	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -783,7 +783,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 	if (num == 0 && blks > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
-			*(where->p + i ) = cpu_to_le32(current_block++);
+			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 
 	/*
@@ -1241,7 +1241,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 			BUFFER_TRACE(bh, "call get_create_access");
 			fatal = ext4_journal_get_create_access(handle, bh);
 			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data,0,inode->i_sb->s_blocksize);
+				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
@@ -1266,7 +1266,7 @@ err:
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t block, int create, int *err)
 {
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 
 	bh = ext4_getblk(handle, inode, block, create, err);
 	if (!bh)
@@ -1282,13 +1282,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 	return NULL;
 }
 
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+static int walk_page_buffers(handle_t *handle,
+			     struct buffer_head *head,
+			     unsigned from,
+			     unsigned to,
+			     int *partial,
+			     int (*fn)(handle_t *handle,
+				       struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -1296,9 +1296,9 @@ static int walk_page_buffers(	handle_t *handle,
 	int err, ret = 0;
 	struct buffer_head *next;
 
-	for (	bh = head, block_start = 0;
-		ret == 0 && (bh != head || !block_start);
-		block_start = block_end, bh = next)
+	for (bh = head, block_start = 0;
+	     ret == 0 && (bh != head || !block_start);
+	     block_start = block_end, bh = next)
 	{
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
@@ -1351,23 +1351,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
- 	struct inode *inode = mapping->host;
+	struct inode *inode = mapping->host;
 	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
- 	struct page *page;
+	struct page *page;
  	pgoff_t index;
- 	unsigned from, to;
+	unsigned from, to;
 
  	index = pos >> PAGE_CACHE_SHIFT;
- 	from = pos & (PAGE_CACHE_SIZE - 1);
- 	to = from + len;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
 
 retry:
-  	handle = ext4_journal_start(inode, needed_blocks);
-  	if (IS_ERR(handle)) {
-  		ret = PTR_ERR(handle);
-  		goto out;
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
 	}
 
 	page = __grab_cache_page(mapping, index);
@@ -1387,9 +1387,9 @@ retry:
 	}
 
 	if (ret) {
- 		unlock_page(page);
+		unlock_page(page);
 		ext4_journal_stop(handle);
- 		page_cache_release(page);
+		page_cache_release(page);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2456,7 +2456,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 	bh = page_buffers(page);
 	idx = offset >> inode->i_blkbits;
 
-	for (i=0; i < idx; i++)
+	for (i = 0; i < idx; i++)
 		bh = bh->b_this_page;
 
 	if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2476,7 +2476,7 @@ static int ext4_da_write_end(struct file *file,
 	unsigned long start, end;
 
 	start = pos & (PAGE_CACHE_SIZE - 1);
-	end = start + copied -1;
+	end = start + copied - 1;
 
 	/*
 	 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2591,7 +2591,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 			return 0;
 	}
 
-	return generic_block_bmap(mapping,block,ext4_get_block);
+	return generic_block_bmap(mapping, block, ext4_get_block);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3197,7 +3197,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
 	if (!partial->key && *partial->p)
 		/* Writer: end */
 		goto no_top;
-	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
 		;
 	/*
 	 * OK, we've found the last block that must survive. The rest of our
@@ -3216,7 +3216,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
 	}
 	/* Writer: end */
 
-	while(partial > p) {
+	while (partial > p) {
 		brelse(partial->bh);
 		partial--;
 	}
@@ -3408,9 +3408,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			/* This zaps the entire block.  Bottom up. */
 			BUFFER_TRACE(bh, "free child branches");
 			ext4_free_branches(handle, inode, bh,
-					   (__le32*)bh->b_data,
-					   (__le32*)bh->b_data + addr_per_block,
-					   depth);
+					(__le32 *) bh->b_data,
+					(__le32 *) bh->b_data + addr_per_block,
+					depth);
 
 			/*
 			 * We've probably journalled the indirect block several
@@ -3927,7 +3927,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
-	if(!(test_opt (inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
@@ -3945,7 +3945,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
-			brelse (bh);
+			brelse(bh);
 			ret = -ESTALE;
 			goto bad_inode;
 		}
@@ -3978,7 +3978,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
-			brelse (bh);
+			brelse(bh);
 			ret = -EIO;
 			goto bad_inode;
 		}
@@ -4031,7 +4031,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	}
-	brelse (iloc.bh);
+	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
 	unlock_new_inode(inode);
 	return inode;
@@ -4113,14 +4113,14 @@ static int ext4_do_update_inode(handle_t *handle,
 
 	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
-	if(!(test_opt(inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 /*
  * Fix up interoperability with old kernels. Otherwise, old inodes get
  * re-used with the upper 16 bits of the uid/gid intact
  */
-		if(!ei->i_dtime) {
+		if (!ei->i_dtime) {
 			raw_inode->i_uid_high =
 				cpu_to_le16(high_16_bits(inode->i_uid));
 			raw_inode->i_gid_high =
@@ -4208,7 +4208,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
-	brelse (bh);
+	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba..ca09dd1039e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -25,7 +25,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	unsigned int flags;
 	unsigned short rsv_window_size;
 
-	ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
@@ -186,7 +186,7 @@ setversion_out:
 	case EXT4_IOC_SETRSVSZ: {
 		int err;
 
-		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+		if (!test_opt(inode->i_sb, RESERVATION) || !S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
 		if (!is_owner_or_cap(inode))
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 978b57f8630..a1f72d217c7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,26 +151,26 @@ struct dx_map_entry
 
 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
 static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
 static struct dx_frame *dx_probe(struct dentry *dentry,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
 				 int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -207,44 +207,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
 	entry->block = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_hash (struct dx_entry *entry)
+static inline unsigned dx_get_hash(struct dx_entry *entry)
 {
 	return le32_to_cpu(entry->hash);
 }
 
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
 {
 	entry->hash = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_count (struct dx_entry *entries)
+static inline unsigned dx_get_count(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
 }
 
-static inline unsigned dx_get_limit (struct dx_entry *entries)
+static inline unsigned dx_get_limit(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
 }
 
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
 }
 
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
 }
 
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
 	return entry_space / sizeof(struct dx_entry);
 }
 
-static inline unsigned dx_node_limit (struct inode *dir)
+static inline unsigned dx_node_limit(struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
 	return entry_space / sizeof(struct dx_entry);
@@ -306,7 +306,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 			     struct dx_entry *entries, int levels)
 {
 	unsigned blocksize = dir->i_sb->s_blocksize;
-	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
 	unsigned bcount = 0;
 	struct buffer_head *bh;
 	int err;
@@ -325,7 +325,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		names += stats.names;
 		space += stats.space;
 		bcount += stats.bcount;
-		brelse (bh);
+		brelse(bh);
 	}
 	if (bcount)
 		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
@@ -407,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 		goto fail;
 	}
 
-	dxtrace (printk("Look up %x", hash));
+	dxtrace(printk("Look up %x", hash));
 	while (1)
 	{
 		count = dx_get_count(entries);
@@ -556,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				      0, &err)))
 			return err; /* Failure */
 		p++;
-		brelse (p->bh);
+		brelse(p->bh);
 		p->bh = bh;
 		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
 	}
@@ -594,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 			/* On error, skip the f_pos to the next block. */
 			dir_file->f_pos = (dir_file->f_pos |
 					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
+			brelse(bh);
 			return count;
 		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -803,7 +803,7 @@ static inline int ext4_match (int len, const char * const name,
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-static inline int search_dirblock(struct buffer_head * bh,
+static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
 				  struct dentry *dentry,
 				  unsigned long offset,
@@ -855,9 +855,9 @@ static inline int search_dirblock(struct buffer_head * bh,
 static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 					struct ext4_dir_entry_2 ** res_dir)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh, *ret = NULL;
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block, b;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
@@ -958,7 +958,7 @@ restart:
 cleanup_and_exit:
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
-		brelse (bh_use[ra_ptr]);
+		brelse(bh_use[ra_ptr]);
 	return ret;
 }
 
@@ -1012,7 +1012,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 				return bh;
 			}
 		}
-		brelse (bh);
+		brelse(bh);
 		/* Check to see if we should continue to search */
 		retval = ext4_htree_next_block(dir, hash, frame,
 					       frames, NULL);
@@ -1032,11 +1032,11 @@ errout:
 	return NULL;
 }
 
-static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode * inode;
-	struct ext4_dir_entry_2 * de;
-	struct buffer_head * bh;
+	struct inode *inode;
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
 
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1045,7 +1045,7 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
-		brelse (bh);
+		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			ext4_error(dir->i_sb, "ext4_lookup",
 				   "bad inode number: %lu", ino);
@@ -1203,10 +1203,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	/* create map in the end of data2 block */
 	map = (struct dx_map_entry *) (data2 + blocksize);
-	count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
 			     blocksize, hinfo, map);
 	map -= count;
-	dx_sort_map (map, count);
+	dx_sort_map(map, count);
 	/* Split the existing block in the middle, size-wise */
 	size = 0;
 	move = 0;
@@ -1227,7 +1227,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
-	de = dx_pack_dirents(data1,blocksize);
+	de = dx_pack_dirents(data1, blocksize);
 	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
 	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1239,15 +1239,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		swap(*bh, bh2);
 		de = de2;
 	}
-	dx_insert_block (frame, hash2 + continued, newblock);
-	err = ext4_journal_dirty_metadata (handle, bh2);
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_journal_dirty_metadata(handle, bh2);
 	if (err)
 		goto journal_error;
-	err = ext4_journal_dirty_metadata (handle, frame->bh);
+	err = ext4_journal_dirty_metadata(handle, frame->bh);
 	if (err)
 		goto journal_error;
-	brelse (bh2);
-	dxtrace(dx_show_index ("frame", frame->entries));
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
 	return de;
 
 journal_error:
@@ -1273,7 +1273,7 @@ errout:
  */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode, struct ext4_dir_entry_2 *de,
-			     struct buffer_head * bh)
+			     struct buffer_head *bh)
 {
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
@@ -1290,11 +1290,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
 						  bh, offset)) {
-				brelse (bh);
+				brelse(bh);
 				return -EIO;
 			}
-			if (ext4_match (namelen, name, de)) {
-				brelse (bh);
+			if (ext4_match(namelen, name, de)) {
+				brelse(bh);
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1331,7 +1331,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	} else
 		de->inode = 0;
 	de->name_len = namelen;
-	memcpy (de->name, name, namelen);
+	memcpy(de->name, name, namelen);
 	/*
 	 * XXX shouldn't update any times until successful
 	 * completion of syscall, but too many callers depend
@@ -1388,7 +1388,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
-	bh2 = ext4_append (handle, dir, &block, &retval);
+	bh2 = ext4_append(handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
 		return retval;
@@ -1414,9 +1414,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 	entries = root->entries;
-	dx_set_block (entries, 1);
-	dx_set_count (entries, 1);
-	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+	dx_set_block(entries, 1);
+	dx_set_count(entries, 1);
+	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
@@ -1445,14 +1445,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
  * may not sleep between calling this and putting something into
  * the entry, as someone else might have used it while you slept.
  */
-static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
-	struct inode *inode)
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+			  struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	unsigned long offset;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	struct super_block * sb;
+	struct super_block *sb;
 	int	retval;
 	int	dx_fallback=0;
 	unsigned blocksize;
@@ -1502,9 +1502,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct super_block * sb = dir->i_sb;
+	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
@@ -1570,11 +1570,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 
-			memcpy ((char *) entries2, (char *) (entries + icount1),
-				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
+			memcpy((char *) entries2, (char *) (entries + icount1),
+			       icount2 * sizeof(struct dx_entry));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
 
 			/* Which index block gets the new entry? */
 			if (at - entries >= icount1) {
@@ -1582,9 +1582,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_journal_dirty_metadata(handle, bh2);
 			if (err)
@@ -1634,12 +1634,12 @@ cleanup:
  * ext4_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext4_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext4_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
 {
-	struct ext4_dir_entry_2 * de, * pde;
+	struct ext4_dir_entry_2 *de, *pde;
 	int i;
 
 	i = 0;
@@ -1720,11 +1720,11 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
-		struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int err, retries = 0;
 
 retry:
@@ -1751,8 +1751,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
-			int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      int mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -1771,7 +1771,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1786,12 +1786,12 @@ retry:
 	return err;
 }
 
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *dir_block;
+	struct ext4_dir_entry_2 *de;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1807,7 +1807,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1815,7 +1815,7 @@ retry:
 	inode->i_op = &ext4_dir_inode_operations;
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1824,26 +1824,26 @@ retry:
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
+	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
 	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
 						EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
-	strcpy (de->name, "..");
+	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
 	ext4_journal_dirty_metadata(handle, dir_block);
-	brelse (dir_block);
+	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry (handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
+		iput(inode);
 		goto out_stop;
 	}
 	ext4_inc_count(handle, dir);
@@ -1860,17 +1860,17 @@ out_stop:
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
 {
 	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de, * de1;
-	struct super_block * sb;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
 	int err = 0;
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext4_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1885,23 +1885,23 @@ static int empty_dir (struct inode * inode)
 	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext4_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		brelse (bh);
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb, "empty_dir",
+			     "bad directory (dir #%lu) - no `.' or `..'",
+			     inode->i_ino);
+		brelse(bh);
 		return 1;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len) +
 		 ext4_rec_len_from_disk(de1->rec_len);
 	de = ext4_next_entry(de1);
-	while (offset < inode->i_size ) {
+	while (offset < inode->i_size) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
-			brelse (bh);
-			bh = ext4_bread (NULL, inode,
+			brelse(bh);
+			bh = ext4_bread(NULL, inode,
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
@@ -1921,13 +1921,13 @@ static int empty_dir (struct inode * inode)
 			continue;
 		}
 		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
+			brelse(bh);
 			return 0;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len);
 		de = ext4_next_entry(de);
 	}
-	brelse (bh);
+	brelse(bh);
 	return 1;
 }
 
@@ -1958,8 +1958,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
 	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2073,12 +2073,12 @@ out_brelse:
 	goto out_err;
 }
 
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go in
@@ -2089,7 +2089,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dentry, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2103,16 +2103,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
+	if (!empty_dir(inode))
 		goto end_rmdir;
 
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning (inode->i_sb, "ext4_rmdir",
-			      "empty directory has too many links (%d)",
-			      inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_rmdir",
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2128,16 +2128,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 
 end_rmdir:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go
@@ -2151,7 +2151,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dentry, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2162,9 +2162,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning (inode->i_sb, "ext4_unlink",
-			      "Deleting nonexistent file (%lu), %d",
-			      inode->i_ino, inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_unlink",
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2182,15 +2182,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 
 end_unlink:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_symlink (struct inode * dir,
-		struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int l, err, retries = 0;
 
 	l = strlen(symname)+1;
@@ -2207,12 +2207,12 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT4_I(inode)->i_data)) {
+	if (l > sizeof(EXT4_I(inode)->i_data)) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
@@ -2225,14 +2225,14 @@ retry:
 		if (err) {
 			clear_nlink(inode);
 			ext4_mark_inode_dirty(handle, inode);
-			iput (inode);
+			iput(inode);
 			goto out_stop;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
 		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 		inode->i_op = &ext4_fast_symlink_inode_operations;
-		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2244,8 +2244,8 @@ out_stop:
 	return err;
 }
 
-static int ext4_link (struct dentry * old_dentry,
-		struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
@@ -2288,13 +2288,13 @@ retry:
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext4_dir_entry_2 * old_de, * new_de;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
@@ -2312,7 +2312,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry (old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dentry, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2325,32 +2325,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry (new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dentry, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
-			brelse (new_bh);
+			brelse(new_bh);
 			new_bh = NULL;
 		}
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
+			if (!empty_dir(new_inode))
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
+		if (!new_inode && new_dir != old_dir &&
 				new_dir->i_nlink >= EXT4_LINK_MAX)
 			goto end_rename;
 	}
 	if (!new_bh) {
-		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
 		if (retval)
 			goto end_rename;
 	} else {
@@ -2437,9 +2437,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
 	ext4_journal_stop(handle);
 	return retval;
 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b3d35604ea1..4392e3fd0f0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 		       gdb_num);
 
-        /*
-         * If we are not using the primary superblock/GDT copy don't resize,
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
          * because the user tools have no way of handling this.  Probably a
          * bad time to do it anyways.
          */
@@ -964,7 +964,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c10aaf7d83c..7de6ca0c9e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -654,7 +654,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
-		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -822,7 +822,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
 static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -1586,7 +1586,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
 
-	ext4_debug ("Checking group descriptors");
+	ext4_debug("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc0..0013d52f73b 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
-	nd_set_link(nd, (char*)ei->i_data);
+	nd_set_link(nd, (char *) ei->i_data);
 	return NULL;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb..814ea58d4d5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
 	(((name_len) + EXT4_XATTR_ROUND + \
 	sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
 #define EXT4_XATTR_NEXT(entry) \
-	( (struct ext4_xattr_entry *)( \
-	  (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+	((struct ext4_xattr_entry *)( \
+	 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
 #define EXT4_XATTR_SIZE(size) \
 	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
 
-- 
cgit v1.2.3


From c4a0c46ec92c194c873232b88debce4e1a448483 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 19 Aug 2008 21:08:18 -0400
Subject: ext4: invalidate pages if delalloc block allocation fails.

We are a bit agressive in invalidating all the pages. But
it is ok because we really don't know why the block allocation
failed and it is better to come of the writeback path
so that user can look for more info.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 73 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89c92c0f829..b6fa0c4087e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1783,6 +1783,39 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
 		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
 }
 
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
+					sector_t logical, long blk_cnt)
+{
+	int nr_pages, i;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end   = (logical + blk_cnt - 1) >>
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			block_invalidatepage(page, 0);
+			ClearPageUptodate(page);
+			unlock_page(page);
+		}
+	}
+	return;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -1792,7 +1825,7 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
 	struct buffer_head *lbh = &mpd->lbh;
@@ -1803,7 +1836,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
-		return;
+		return 0;
 
 	new.b_state = lbh->b_state;
 	new.b_blocknr = 0;
@@ -1814,10 +1847,38 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * to write simply return
 	 */
 	if (!new.b_size)
-		return;
+		return 0;
 	err = mpd->get_block(mpd->inode, next, &new, 1);
-	if (err)
-		return;
+	if (err) {
+
+		/* If get block returns with error
+		 * we simply return. Later writepage
+		 * will redirty the page and writepages
+		 * will find the dirty page again
+		 */
+		if (err == -EAGAIN)
+			return 0;
+		/*
+		 * get block failure will cause us
+		 * to loop in writepages. Because
+		 * a_ops->writepage won't be able to
+		 * make progress. The page will be redirtied
+		 * by writepage and writepages will again
+		 * try to write the same.
+		 */
+		printk(KERN_EMERG "%s block allocation failed for inode %lu "
+				  "at logical offset %llu with max blocks "
+				  "%zd with error %d\n",
+				  __func__, mpd->inode->i_ino,
+				  (unsigned long long)next,
+				  lbh->b_size >> mpd->inode->i_blkbits, err);
+		printk(KERN_EMERG "This should not happen.!! "
+					"Data will be lost\n");
+		/* invlaidate all the pages */
+		ext4_da_block_invalidatepages(mpd, next,
+				lbh->b_size >> mpd->inode->i_blkbits);
+		return err;
+	}
 	BUG_ON(new.b_size == 0);
 
 	if (buffer_new(&new))
@@ -1830,7 +1891,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (buffer_delay(lbh) || buffer_unwritten(lbh))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
-	return;
+	return 0;
 }
 
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1899,8 +1960,8 @@ flush_it:
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	mpage_da_map_blocks(mpd);
-	mpage_da_submit_io(mpd);
+	if (mpage_da_map_blocks(mpd) == 0)
+		mpage_da_submit_io(mpd);
 	mpd->io_done = 1;
 	return;
 }
@@ -1942,8 +2003,8 @@ static int __mpage_da_writepage(struct page *page,
 		 * and start IO on them using writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_blocks(mpd);
-			mpage_da_submit_io(mpd);
+			if (mpage_da_map_blocks(mpd) == 0)
+				mpage_da_submit_io(mpd);
 			/*
 			 * skip rest of the page in the page_vec
 			 */
@@ -2046,8 +2107,8 @@ static int mpage_da_writepages(struct address_space *mapping,
 	 * Handle last extent of pages
 	 */
 	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-		mpage_da_map_blocks(&mpd);
-		mpage_da_submit_io(&mpd);
+		if (mpage_da_map_blocks(&mpd) == 0)
+			mpage_da_submit_io(&mpd);
 	}
 
 	wbc->nr_to_write = to_write - mpd.pages_written;
-- 
cgit v1.2.3


From 04da11bfcf511544ae19e0a7e5f994b3237752ac Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 20 Aug 2008 17:16:34 +0300
Subject: UBIFS: fix zero-length truncations

Always allow truncations to zero, even if budgeting thinks there
is no space. UBIFS reserves some space for deletions anyway.

Otherwise, the following happans:
1. create a file, and write as much as possible there, until ENOSPC
2. truncate the file, which fails with ENOSPC, which is not good.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/dir.c  |  1 -
 fs/ubifs/file.c | 20 ++++++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c96f1fb701..2b267c9a180 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	if (err) {
 		if (err != -ENOSPC)
 			return err;
-		err = 0;
 		budgeted = 0;
 	}
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4071d1cae29..3d698e2022b 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	int err;
 	struct ubifs_budget_req req;
 	loff_t old_size = inode->i_size, new_size = attr->ia_size;
-	int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
+	int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	/* A funny way to budget for truncation node */
 	req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
 	err = ubifs_budget_space(c, &req);
-	if (err)
-		return err;
+	if (err) {
+		/*
+		 * Treat truncations to zero as deletion and always allow them,
+		 * just like we do for '->unlink()'.
+		 */
+		if (new_size || err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
 
 	err = vmtruncate(inode, new_size);
 	if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
 	mutex_unlock(&ui->ui_mutex);
 out_budg:
-	ubifs_release_budget(c, &req);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		c->nospace = c->nospace_rp = 0;
+		smp_wmb();
+	}
 	return err;
 }
 
-- 
cgit v1.2.3


From 761e29f3bb19b05bea55285dfdf2d28e001a63b8 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Wed, 20 Aug 2008 16:32:40 +0300
Subject: UBIFS: always read hashed-key nodes under TNC mutex

Leaf-nodes that have a hashed key are stored in the
leaf-node-cache (LNC) which is protected by the TNC
mutex.  Consequently, when reading a leaf node with
a hashed key (i.e. directory entries, xattr entries)
the TNC mutex is always required.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/tnc.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a9644..4fbc5921688 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1498,7 +1498,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 {
 	int found, n, err;
 	struct ubifs_znode *znode;
-	struct ubifs_zbranch zbr;
 
 	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
 	mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1521,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 		goto out_unlock;
 	}
 
-	zbr = znode->zbranch[n];
-	mutex_unlock(&c->tnc_mutex);
-
-	err = tnc_read_node_nm(c, &zbr, node);
-	return err;
+	err = tnc_read_node_nm(c, &znode->zbranch[n], node);
 
 out_unlock:
 	mutex_unlock(&c->tnc_mutex);
-- 
cgit v1.2.3


From 601c0bc46753007be011b513ba4fc50ed8e30aef Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 22 Aug 2008 14:23:35 +0300
Subject: UBIFS: allow for racing between GC and TNC

The TNC mutex is unlocked prematurely when reading leaf nodes
with non-hashed keys.  This is unsafe because the node may be
moved by garbage collection and the eraseblock unmapped, although
that has never actually happened during stress testing.

This patch fixes the flaw by detecting the race and retrying with
the TNC mutex locked.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/gc.c    |   6 +++
 fs/ubifs/misc.h  |  17 +++++++++
 fs/ubifs/tnc.c   | 109 ++++++++++++++++++++++++++++++-------------------------
 fs/ubifs/ubifs.h |   6 ++-
 4 files changed, 87 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac2908..13f1019c859 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -344,6 +344,12 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 		if (err)
 			goto out;
 
+		/* Allow for races with TNC */
+		c->gced_lnum = lnum;
+		smp_wmb();
+		c->gc_seq += 1;
+		smp_wmb();
+
 		if (c->gc_lnum == -1) {
 			c->gc_lnum = lnum;
 			err = LEB_RETAINED;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87dabf9fe74..87ced4c74a6 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -325,4 +325,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
 		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
 
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+				   const union ubifs_key *key, void *node)
+{
+	return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
+
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 4fbc5921688..7da209ab937 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
 		if (keys_cmp(c, key, &node_key) != 0)
 			ret = 0;
 	}
-	if (ret == 0)
+	if (ret == 0 && c->replaying)
 		dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
 			zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
 	return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
 }
 
 /**
- * ubifs_tnc_lookup - look up a file-system node.
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
  * @c: UBIFS file-system description object
- * @key: node key to lookup
- * @node: the node is returned here
+ * @lnum: LEB number
+ * @gc_seq1: garbage collection sequence number
  *
- * This function look up and reads node with key @key. The caller has to make
- * sure the @node buffer is large enough to fit the node. Returns zero in case
- * of success, %-ENOENT if the node was not found, and a negative error code in
- * case of failure.
+ * This function determines if @lnum may have been garbage collected since
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
+ * %0 is returned.
  */
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
-		     void *node)
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
 {
-	int found, n, err;
-	struct ubifs_znode *znode;
-	struct ubifs_zbranch zbr, *zt;
-
-	mutex_lock(&c->tnc_mutex);
-	found = ubifs_lookup_level0(c, key, &znode, &n);
-	if (!found) {
-		err = -ENOENT;
-		goto out;
-	} else if (found < 0) {
-		err = found;
-		goto out;
-	}
-	zt = &znode->zbranch[n];
-	if (is_hash_key(c, key)) {
-		/*
-		 * In this case the leaf node cache gets used, so we pass the
-		 * address of the zbranch and keep the mutex locked
-		 */
-		err = tnc_read_node_nm(c, zt, node);
-		goto out;
-	}
-	zbr = znode->zbranch[n];
-	mutex_unlock(&c->tnc_mutex);
-
-	err = ubifs_tnc_read_node(c, &zbr, node);
-	return err;
+	int gc_seq2, gced_lnum;
 
-out:
-	mutex_unlock(&c->tnc_mutex);
-	return err;
+	gced_lnum = c->gced_lnum;
+	smp_rmb();
+	gc_seq2 = c->gc_seq;
+	/* Same seq means no GC */
+	if (gc_seq1 == gc_seq2)
+		return 0;
+	/* Different by more than 1 means we don't know */
+	if (gc_seq1 + 1 != gc_seq2)
+		return 1;
+	/*
+	 * We have seen the sequence number has increased by 1. Now we need to
+	 * be sure we read the right LEB number, so read it again.
+	 */
+	smp_rmb();
+	if (gced_lnum != c->gced_lnum)
+		return 1;
+	/* Finally we can check lnum */
+	if (gced_lnum == lnum)
+		return 1;
+	return 0;
 }
 
 /**
@@ -1436,16 +1425,19 @@ out:
  * @lnum: LEB number is returned here
  * @offs: offset is returned here
  *
- * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
- * location also. See 'ubifs_tnc_lookup()'.
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure. The node location can be returned in @lnum and @offs.
  */
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
 		     void *node, int *lnum, int *offs)
 {
-	int found, n, err;
+	int found, n, err, safely = 0, gc_seq1;
 	struct ubifs_znode *znode;
 	struct ubifs_zbranch zbr, *zt;
 
+again:
 	mutex_lock(&c->tnc_mutex);
 	found = ubifs_lookup_level0(c, key, &znode, &n);
 	if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
 		goto out;
 	}
 	zt = &znode->zbranch[n];
+	if (lnum) {
+		*lnum = zt->lnum;
+		*offs = zt->offs;
+	}
 	if (is_hash_key(c, key)) {
 		/*
 		 * In this case the leaf node cache gets used, so we pass the
 		 * address of the zbranch and keep the mutex locked
 		 */
-		*lnum = zt->lnum;
-		*offs = zt->offs;
 		err = tnc_read_node_nm(c, zt, node);
 		goto out;
 	}
+	if (safely) {
+		err = ubifs_tnc_read_node(c, zt, node);
+		goto out;
+	}
+	/* Drop the TNC mutex prematurely and race with garbage collection */
 	zbr = znode->zbranch[n];
+	gc_seq1 = c->gc_seq;
 	mutex_unlock(&c->tnc_mutex);
 
-	*lnum = zbr.lnum;
-	*offs = zbr.offs;
+	if (ubifs_get_wbuf(c, zbr.lnum)) {
+		/* We do not GC journal heads */
+		err = ubifs_tnc_read_node(c, &zbr, node);
+		return err;
+	}
 
-	err = ubifs_tnc_read_node(c, &zbr, node);
-	return err;
+	err = fallible_read_node(c, key, &zbr, node);
+	if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+		/*
+		 * The node may have been GC'ed out from under us so try again
+		 * while keeping the TNC mutex locked.
+		 */
+		safely = 1;
+		goto again;
+	}
+	return 0;
 
 out:
 	mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d7f706f7a30..7828d69ca4f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1028,6 +1028,8 @@ struct ubifs_mount_opts {
  * @sbuf: a buffer of LEB size used by GC and replay for scanning
  * @idx_gc: list of index LEBs that have been garbage collected
  * @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
  *
  * @infos_list: links all 'ubifs_info' objects
  * @umount_mutex: serializes shrinker and un-mount
@@ -1257,6 +1259,8 @@ struct ubifs_info {
 	void *sbuf;
 	struct list_head idx_gc;
 	int idx_gc_cnt;
+	volatile int gc_seq;
+	volatile int gced_lnum;
 
 	struct list_head infos_list;
 	struct mutex umount_mutex;
@@ -1451,8 +1455,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
 /* tnc.c */
 int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 			struct ubifs_znode **zn, int *n);
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
-		     void *node);
 int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 			void *node, const struct qstr *nm);
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
-- 
cgit v1.2.3


From 0188d6c5807b65e2e20dcb75a668efbe5418b27e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 26 Aug 2008 09:38:26 +0100
Subject: GFS2: Fix & clean up GFS2 rename

This patch fixes a locking issue in the rename code by ensuring that we hold
the per sb rename lock over both directory and "other" renames which involve
different parent directories.

At the same time, this moved the (only called from one place) function
gfs2_ok_to_move into the file that its called from, so we can mark it
static. This should make a code a bit easier to follow.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Peter Staubach <staubach@redhat.com>
---
 fs/gfs2/inode.c     | 48 ------------------------------------
 fs/gfs2/inode.h     |  1 -
 fs/gfs2/ops_inode.c | 71 ++++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a3294..87525523446 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1140,54 +1140,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	return 0;
 }
 
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
-	struct inode *dir = &to->i_inode;
-	struct super_block *sb = dir->i_sb;
-	struct inode *tmp;
-	struct qstr dotdot;
-	int error = 0;
-
-	gfs2_str2qstr(&dotdot, "..");
-
-	igrab(dir);
-
-	for (;;) {
-		if (dir == &this->i_inode) {
-			error = -EINVAL;
-			break;
-		}
-		if (dir == sb->s_root->d_inode) {
-			error = 0;
-			break;
-		}
-
-		tmp = gfs2_lookupi(dir, &dotdot, 1);
-		if (IS_ERR(tmp)) {
-			error = PTR_ERR(tmp);
-			break;
-		}
-
-		iput(dir);
-		dir = tmp;
-	}
-
-	iput(dir);
-
-	return error;
-}
-
 /**
  * gfs2_readlinki - return the contents of a symlink
  * @ip: the symlink's inode
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a8..bfd2afc0c90 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,7 +91,6 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		   const struct gfs2_inode *ip);
 int gfs2_permission(struct inode *inode, int mask);
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
 int gfs2_glock_nq_atime(struct gfs2_holder *gh);
 int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 35f6f032a02..534e1e2c65c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -589,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	return 0;
 }
 
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *tmp;
+	struct qstr dotdot;
+	int error = 0;
+
+	gfs2_str2qstr(&dotdot, "..");
+
+	igrab(dir);
+
+	for (;;) {
+		if (dir == &this->i_inode) {
+			error = -EINVAL;
+			break;
+		}
+		if (dir == sb->s_root->d_inode) {
+			error = 0;
+			break;
+		}
+
+		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		if (IS_ERR(tmp)) {
+			error = PTR_ERR(tmp);
+			break;
+		}
+
+		iput(dir);
+		dir = tmp;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
 /**
  * gfs2_rename - Rename a file
  * @odir: Parent directory of old file name
@@ -607,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh;
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -621,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
-	/* Make sure we aren't trying to move a dirctory into it's subdir */
 
-	if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
-		dir_rename = 1;
-
-		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0,
-					   &r_gh);
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
 		if (error)
 			goto out;
 
-		error = gfs2_ok_to_move(ip, ndip);
-		if (error)
-			goto out_gunlock_r;
+		if (S_ISDIR(ip->i_inode.i_mode)) {
+			dir_rename = 1;
+			/* don't move a dirctory into it's subdir */
+			error = gfs2_ok_to_move(ip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
 	}
 
 	num_gh = 1;
@@ -829,7 +878,7 @@ out_gunlock:
 		gfs2_holder_uninit(ghs + x);
 	}
 out_gunlock_r:
-	if (dir_rename)
+	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
 	return error;
-- 
cgit v1.2.3


From 0f8e0d9a317406612700426fad3efab0b7bbc467 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 6 Aug 2008 13:30:24 -0500
Subject: dlm: allow multiple lockspace creates

Add a count for lockspace create and release so that create can
be called multiple times to use the lockspace from different places.
Also add the new flag DLM_LSFL_NEWEXCL to create a lockspace with
the previous behavior of returning -EEXIST if the lockspace already
exists.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h       |   6 ++-
 fs/dlm/lockspace.c          | 107 +++++++++++++++++++++++++++++---------------
 fs/dlm/user.c               |  54 +++++++++++++---------
 fs/dlm/user.h               |   3 +-
 fs/gfs2/locking/dlm/mount.c |   3 +-
 5 files changed, 111 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629..9e0622aff49 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -441,7 +441,9 @@ struct dlm_ls {
 	uint32_t		ls_global_id;	/* global unique lockspace ID */
 	uint32_t		ls_exflags;
 	int			ls_lvblen;
-	int			ls_count;	/* reference count */
+	int			ls_count;	/* refcount of processes in
+						   the dlm using this ls */
+	int			ls_create_count; /* create/release refcount */
 	unsigned long		ls_flags;	/* LSFL_ */
 	struct kobject		ls_kobj;
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e9..56eae4e4a95 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
 #include "lock.h"
 #include "recover.h"
 #include "requestqueue.h"
+#include "user.h"
 
 static int			ls_count;
 static struct mutex		ls_lock;
@@ -246,23 +247,6 @@ static void dlm_scand_stop(void)
 	kthread_stop(scand_task);
 }
 
-static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
-{
-	struct dlm_ls *ls;
-
-	spin_lock(&lslist_lock);
-
-	list_for_each_entry(ls, &lslist, ls_list) {
-		if (ls->ls_namelen == namelen &&
-		    memcmp(ls->ls_name, name, namelen) == 0)
-			goto out;
-	}
-	ls = NULL;
- out:
-	spin_unlock(&lslist_lock);
-	return ls;
-}
-
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
 {
 	struct dlm_ls *ls;
@@ -327,6 +311,7 @@ static void remove_lockspace(struct dlm_ls *ls)
 	for (;;) {
 		spin_lock(&lslist_lock);
 		if (ls->ls_count == 0) {
+			WARN_ON(ls->ls_create_count != 0);
 			list_del(&ls->ls_list);
 			spin_unlock(&lslist_lock);
 			return;
@@ -381,7 +366,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 			 uint32_t flags, int lvblen)
 {
 	struct dlm_ls *ls;
-	int i, size, error = -ENOMEM;
+	int i, size, error;
 	int do_unreg = 0;
 
 	if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +378,32 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	if (!try_module_get(THIS_MODULE))
 		return -EINVAL;
 
-	ls = dlm_find_lockspace_name(name, namelen);
-	if (ls) {
-		*lockspace = ls;
+	error = 0;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		WARN_ON(ls->ls_create_count <= 0);
+		if (ls->ls_namelen != namelen)
+			continue;
+		if (memcmp(ls->ls_name, name, namelen))
+			continue;
+		if (flags & DLM_LSFL_NEWEXCL) {
+			error = -EEXIST;
+			break;
+		}
+		ls->ls_create_count++;
 		module_put(THIS_MODULE);
-		return -EEXIST;
+		error = 1; /* not an error, return 0 */
+		break;
 	}
+	spin_unlock(&lslist_lock);
+
+	if (error < 0)
+		goto out;
+	if (error)
+		goto ret_zero;
+
+	error = -ENOMEM;
 
 	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
 	if (!ls)
@@ -418,8 +423,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 		ls->ls_allocation = GFP_KERNEL;
 
 	/* ls_exflags are forced to match among nodes, and we don't
-	   need to require all nodes to have TIMEWARN or FS set */
-	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
+	   need to require all nodes to have some flags set */
+	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+				    DLM_LSFL_NEWEXCL));
 
 	size = dlm_config.ci_rsbtbl_size;
 	ls->ls_rsbtbl_size = size;
@@ -510,6 +516,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	down_write(&ls->ls_in_recovery);
 
 	spin_lock(&lslist_lock);
+	ls->ls_create_count = 1;
 	list_add(&ls->ls_list, &lslist);
 	spin_unlock(&lslist_lock);
 
@@ -548,7 +555,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	dlm_create_debug_file(ls);
 
 	log_debug(ls, "join complete");
-
+ ret_zero:
 	*lockspace = ls;
 	return 0;
 
@@ -635,11 +642,32 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *rsb;
 	struct list_head *head;
-	int i;
-	int busy = lockspace_busy(ls);
+	int i, busy, rv;
+
+	busy = lockspace_busy(ls);
+
+	spin_lock(&lslist_lock);
+	if (ls->ls_create_count == 1) {
+		if (busy > force)
+			rv = -EBUSY;
+		else {
+			/* remove_lockspace takes ls off lslist */
+			ls->ls_create_count = 0;
+			rv = 0;
+		}
+	} else if (ls->ls_create_count > 1) {
+		rv = --ls->ls_create_count;
+	} else {
+		rv = -EINVAL;
+	}
+	spin_unlock(&lslist_lock);
+
+	if (rv) {
+		log_debug(ls, "release_lockspace no remove %d", rv);
+		return rv;
+	}
 
-	if (busy > force)
-		return -EBUSY;
+	dlm_device_deregister(ls);
 
 	if (force < 3)
 		do_uevent(ls, 0);
@@ -720,15 +748,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	dlm_clear_members(ls);
 	dlm_clear_members_gone(ls);
 	kfree(ls->ls_node_array);
+	log_debug(ls, "release_lockspace final free");
 	kobject_put(&ls->ls_kobj);
 	/* The ls structure will be freed when the kobject is done with */
 
-	mutex_lock(&ls_lock);
-	ls_count--;
-	if (!ls_count)
-		threads_stop();
-	mutex_unlock(&ls_lock);
-
 	module_put(THIS_MODULE);
 	return 0;
 }
@@ -750,11 +773,21 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 int dlm_release_lockspace(void *lockspace, int force)
 {
 	struct dlm_ls *ls;
+	int error;
 
 	ls = dlm_find_lockspace_local(lockspace);
 	if (!ls)
 		return -EINVAL;
 	dlm_put_lockspace(ls);
-	return release_lockspace(ls, force);
+
+	mutex_lock(&ls_lock);
+	error = release_lockspace(ls, force);
+	if (!error)
+		ls_count--;
+	else if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	return error;
 }
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 34f14a14fb4..6542110c0da 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -340,10 +340,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
 	return error;
 }
 
-static int create_misc_device(struct dlm_ls *ls, char *name)
+static int dlm_device_register(struct dlm_ls *ls, char *name)
 {
 	int error, len;
 
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
 	error = -ENOMEM;
 	len = strlen(name) + strlen(name_prefix) + 2;
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +368,22 @@ fail:
 	return error;
 }
 
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	int error;
+
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	error = misc_deregister(&ls->ls_device);
+	if (!error)
+		kfree(ls->ls_device.name);
+	return error;
+}
+
 static int device_user_purge(struct dlm_user_proc *proc,
 			     struct dlm_purge_params *params)
 {
@@ -397,7 +418,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
-	error = create_misc_device(ls, params->name);
+	error = dlm_device_register(ls, params->name);
 	dlm_put_lockspace(ls);
 
 	if (error)
@@ -421,31 +442,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
-	/* Deregister the misc device first, so we don't have
-	 * a device that's not attached to a lockspace. If
-	 * dlm_release_lockspace fails then we can recreate it
-	 */
-	error = misc_deregister(&ls->ls_device);
-	if (error) {
-		dlm_put_lockspace(ls);
-		goto out;
-	}
-	kfree(ls->ls_device.name);
-
 	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
 		force = 2;
 
 	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
 
-	/* dlm_release_lockspace waits for references to go to zero,
-	   so all processes will need to close their device for the ls
-	   before the release will procede */
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
 
-	dlm_put_lockspace(ls);
 	error = dlm_release_lockspace(lockspace, force);
-	if (error)
-		create_misc_device(ls, ls->ls_name);
- out:
+	if (error > 0)
+		error = 0;
 	return error;
 }
 
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e415..c528b6b2991 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,6 @@
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
 int dlm_user_init(void);
 void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
 
 #endif
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f4..0c4cbe6c828 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
 
 	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
 				  &ls->dlm_lockspace,
-				  DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0),
+				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+				  (nodir ? DLM_LSFL_NODIR : 0),
 				  GDLM_LVB_SIZE);
 	if (error) {
 		log_error("dlm_new_lockspace error %d", error);
-- 
cgit v1.2.3


From dc68c7ed362a00a48290252573a8eb9f74463c3a Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 18 Aug 2008 11:43:30 -0500
Subject: dlm: detect available userspace daemon

If dlm_controld (the userspace daemon that controls the setup and
recovery of the dlm) fails, the kernel should shut down the lockspaces
in the kernel rather than leaving them running.  This is detected by
having dlm_controld hold a misc device open while running, and if
the kernel detects a close while the daemon is still needed, it stops
the lockspaces in the kernel.

Knowing that the userspace daemon isn't running also allows the
lockspace create/remove routines to avoid waiting on the daemon
for join/leave operations.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 24 ++++++++++++++++++++-
 fs/dlm/lockspace.h |  1 +
 fs/dlm/user.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/dlm/user.h      |  1 +
 4 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 56eae4e4a95..ba672fe0a60 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -378,6 +378,11 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	if (!try_module_get(THIS_MODULE))
 		return -EINVAL;
 
+	if (!dlm_user_daemon_available()) {
+		module_put(THIS_MODULE);
+		return -EUNATCH;
+	}
+
 	error = 0;
 
 	spin_lock(&lslist_lock);
@@ -669,7 +674,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 
 	dlm_device_deregister(ls);
 
-	if (force < 3)
+	if (force < 3 && dlm_user_daemon_available())
 		do_uevent(ls, 0);
 
 	dlm_recoverd_stop(ls);
@@ -791,3 +796,20 @@ int dlm_release_lockspace(void *lockspace, int force)
 	return error;
 }
 
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+
+ restart:
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+			continue;
+		spin_unlock(&lslist_lock);
+		log_error(ls, "no userland control daemon, stopping lockspace");
+		dlm_ls_stop(ls);
+		goto restart;
+	}
+	spin_unlock(&lslist_lock);
+}
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd02..f879f87901f 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
 struct dlm_ls *dlm_find_lockspace_local(void *id);
 struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
 
 #endif				/* __LOCKSPACE_DOT_H__ */
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 6542110c0da..81627b502a5 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -27,6 +27,8 @@
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
 
 #ifdef CONFIG_COMPAT
 
@@ -890,6 +892,26 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
 	return 0;
 }
 
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+
+	if (dlm_monitor_unused)
+		return 1;
+
+	return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
 	cycle_kernel_lock();
@@ -902,6 +924,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&dlm_monitor_opened);
+	dlm_monitor_unused = 0;
+	return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&dlm_monitor_opened))
+		dlm_stop_lockspaces();
+	return 0;
+}
+
 static const struct file_operations device_fops = {
 	.open    = device_open,
 	.release = device_close,
@@ -925,19 +961,42 @@ static struct miscdevice ctl_device = {
 	.minor = MISC_DYNAMIC_MINOR,
 };
 
+static const struct file_operations monitor_device_fops = {
+	.open    = monitor_device_open,
+	.release = monitor_device_close,
+	.owner   = THIS_MODULE,
+};
+
+static struct miscdevice monitor_device = {
+	.name  = "dlm-monitor",
+	.fops  = &monitor_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
 int __init dlm_user_init(void)
 {
 	int error;
 
+	atomic_set(&dlm_monitor_opened, 0);
+
 	error = misc_register(&ctl_device);
-	if (error)
+	if (error) {
 		log_print("misc_register failed for control device");
+		goto out;
+	}
 
+	error = misc_register(&monitor_device);
+	if (error) {
+		log_print("misc_register failed for monitor device");
+		misc_deregister(&ctl_device);
+	}
+ out:
 	return error;
 }
 
 void dlm_user_exit(void)
 {
 	misc_deregister(&ctl_device);
+	misc_deregister(&monitor_device);
 }
 
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index c528b6b2991..35eb6a13d61 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -13,5 +13,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
 
 #endif
-- 
cgit v1.2.3


From c1dcf65ffc5796bf4ff75c13f448e63b3a416fd6 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 18 Aug 2008 14:03:25 -0500
Subject: dlm: fix locking of lockspace list in dlm_scand

The dlm_scand thread needs to lock the list of lockspaces
when going through it.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h |  1 +
 fs/dlm/lockspace.c    | 27 +++++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 9e0622aff49..868e4c9ef12 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -445,6 +445,7 @@ struct dlm_ls {
 						   the dlm using this ls */
 	int			ls_create_count; /* create/release refcount */
 	unsigned long		ls_flags;	/* LSFL_ */
+	unsigned long		ls_scan_time;
 	struct kobject		ls_kobj;
 
 	struct dlm_rsbtable	*ls_rsbtbl;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index ba672fe0a60..d910501de6d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -212,19 +212,41 @@ void dlm_lockspace_exit(void)
 	kset_unregister(dlm_kset);
 }
 
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (time_after_eq(jiffies, ls->ls_scan_time +
+					    dlm_config.ci_scan_secs * HZ)) {
+			spin_unlock(&lslist_lock);
+			return ls;
+		}
+	}
+	spin_unlock(&lslist_lock);
+	return NULL;
+}
+
 static int dlm_scand(void *data)
 {
 	struct dlm_ls *ls;
+	int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
 	while (!kthread_should_stop()) {
-		list_for_each_entry(ls, &lslist, ls_list) {
+		ls = find_ls_to_scan();
+		if (ls) {
 			if (dlm_lock_recovery_try(ls)) {
+				ls->ls_scan_time = jiffies;
 				dlm_scan_rsbs(ls);
 				dlm_scan_timeout(ls);
 				dlm_unlock_recovery(ls);
+			} else {
+				ls->ls_scan_time += HZ;
 			}
+		} else {
+			schedule_timeout_interruptible(timeout_jiffies);
 		}
-		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
 	}
 	return 0;
 }
@@ -418,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	ls->ls_lvblen = lvblen;
 	ls->ls_count = 0;
 	ls->ls_flags = 0;
+	ls->ls_scan_time = jiffies;
 
 	if (flags & DLM_LSFL_TIMEWARN)
 		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
-- 
cgit v1.2.3


From 8191e1fa8131a422f4bf7b0f2dc1f8543fd17783 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 22 Aug 2008 12:26:40 +0300
Subject: UBIFS: do not update min_idx_lebs in stafs

This is bad because the rest of the code should not depend on it,
and this may hide bugss, instead of revealing them.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 15409815747..ac0d2e1e73b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -741,7 +741,6 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c)
 
 	available = ubifs_calc_available(c, min_idx_lebs);
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	c->min_idx_lebs = min_idx_lebs;
 	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
-- 
cgit v1.2.3


From 9e5de3549615818cae9c20a0ee1fd3ad4a747758 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 17:29:43 +0300
Subject: UBIFS: push empty flash hack down

We have a hack which forces the amount of flash space to be
equivalent to 'c->blocks_cnt' in case of empty FS. This is
to make users happy and see '%0' used in 'df' when they
mount an empty FS. This hack is not needed in
'ubifs_calc_available()', but it is only needed the caller,
in 'ubifs_budg_get_free_space()'. So push it down there.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 24 +++++++++++-------------
 fs/ubifs/super.c  |  2 --
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index ac0d2e1e73b..f6d2eaa7a06 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
 	int subtract_lebs;
 	long long available;
 
-	/*
-	 * Force the amount available to the total size reported if the used
-	 * space is zero.
-	 */
-	if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
-	    c->budg_data_growth + c->budg_dd_growth == 0) {
-		/* Do the same calculation as for c->block_cnt */
-		available = c->main_lebs - 2;
-		available *= c->leb_size - c->dark_wm;
-		return available;
-	}
-
 	available = c->main_bytes - c->lst.total_used;
 
 	/*
@@ -739,8 +727,18 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c)
 		return 0;
 	}
 
-	available = ubifs_calc_available(c, min_idx_lebs);
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
+
+	/*
+	 * Force the amount available to the total size reported if the used
+	 * space is zero.
+	 */
+	if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
+		spin_unlock(&c->space_lock);
+		return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
+	}
+
+	available = ubifs_calc_available(c, min_idx_lebs);
 	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f71e6b8822c..1018053519e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -649,8 +649,6 @@ static int init_constants_late(struct ubifs_info *c)
 	 *
 	 * Subtract the LEB reserved for GC and the LEB which is reserved for
 	 * deletions.
-	 *
-	 * Review 'ubifs_calc_available()' if changing this calculation.
 	 */
 	tmp64 = c->main_lebs - 2;
 	tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
-- 
cgit v1.2.3


From 8aabb75017291ba68c09ff5fdb998ef0a1fdaaf9 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 16:02:31 +0300
Subject: UBIFS: remove incorrect index space check

When we report free space to user-space, we should not report
0 if the amount of empty LEBs is too low, because they would
be produced by GC when needed. Thus, just call
'ubifs_calc_available()' straight away which would take
'min_idx_lebs' into account anyway.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f6d2eaa7a06..9ef630a594c 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -709,24 +709,11 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
  */
 long long ubifs_budg_get_free_space(struct ubifs_info *c)
 {
-	int min_idx_lebs, rsvd_idx_lebs;
+	int min_idx_lebs;
 	long long available, outstanding, free;
 
-	/* Do exactly the same calculations as in 'do_budget_space()' */
 	spin_lock(&c->space_lock);
 	min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-
-	if (min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
-	else
-		rsvd_idx_lebs = 0;
-
-	if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
-				- c->lst.taken_empty_lebs) {
-		spin_unlock(&c->space_lock);
-		return 0;
-	}
-
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
 
 	/*
-- 
cgit v1.2.3


From 4b5f2762ec914c9dfd0e9d2377c0574f2ee9a8f9 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 16:15:56 +0300
Subject: UBIFS: improve statfs reporting

Make free space calculation less pessimistic and more realistic,
which in turn improves 'statfs()' reports. Now it lies by 10%-20%,
instead of 20%-30% (10% more honest).

Results of "freespace" test (120MiB volume, 16KiB LEB size,
512 bytes page size). Before the change:

freespace: Test 1: fill the space we have 3 times
freespace: was free: 78274560 bytes 74.6 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 18214912 bytes 17.4 MiB, wrote 23.3% more than predicted
freespace: was free: 76754944 bytes 73.2 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 19738624 bytes 18.8 MiB, wrote 25.7% more than predicted
freespace: was free: 76759040 bytes 73.2 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 19730432 bytes 18.8 MiB, wrote 25.7% more than predicted
freespace: Test 1 finished

freespace: Test 2: gradually lessen amount of free space and fill the FS
freespace: do 10 steps, lessen free space by 6977722 bytes 6.7 MiB each time
freespace: was free: 72273920 bytes 68.9 MiB, wrote: 88891392 bytes 84.8 MiB, delta: 16617472 bytes 15.8 MiB, wrote 23.0% more than predicted
freespace: was free: 66154496 bytes 63.1 MiB, wrote: 81506304 bytes 77.7 MiB, delta: 15351808 bytes 14.6 MiB, wrote 23.2% more than predicted
freespace: was free: 58732544 bytes 56.0 MiB, wrote: 72572928 bytes 69.2 MiB, delta: 13840384 bytes 13.2 MiB, wrote 23.6% more than predicted
freespace: was free: 51552256 bytes 49.2 MiB, wrote: 63754240 bytes 60.8 MiB, delta: 12201984 bytes 11.6 MiB, wrote 23.7% more than predicted
freespace: was free: 44404736 bytes 42.3 MiB, wrote: 54943744 bytes 52.4 MiB, delta: 10539008 bytes 10.1 MiB, wrote 23.7% more than predicted
freespace: was free: 37285888 bytes 35.6 MiB, wrote: 46161920 bytes 44.0 MiB, delta: 8876032 bytes 8.5 MiB, wrote 23.8% more than predicted
freespace: was free: 30171136 bytes 28.8 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 7213056 bytes 6.9 MiB, wrote 23.9% more than predicted
freespace: was free: 23048192 bytes 22.0 MiB, wrote: 28606464 bytes 27.3 MiB, delta: 5558272 bytes 5.3 MiB, wrote 24.1% more than predicted
freespace: was free: 15941632 bytes 15.2 MiB, wrote: 19828736 bytes 18.9 MiB, delta: 3887104 bytes 3.7 MiB, wrote 24.4% more than predicted
freespace: was free: 8830976 bytes 8.4 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 2232320 bytes 2.1 MiB, wrote 25.3% more than predicted
freespace: Test 2 finished

freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS
freespace: do 10 steps, lessen free space by 6985541 bytes 6.7 MiB each time
freespace: trashing: was free: 76840960 bytes 73.3 MiB, need free: 6985550 bytes 6.7 MiB, files created: 248311, delete 225737 (90.9% of them)
freespace: was free: 65228800 bytes 62.2 MiB, wrote: 82530304 bytes 78.7 MiB, delta: 17301504 bytes 16.5 MiB, wrote 26.5% more than predicted
freespace: trashing: was free: 74485760 bytes 71.0 MiB, need free: 13971091 bytes 13.3 MiB, files created: 248712, delete 202061 (81.2% of them)
freespace: was free: 55025664 bytes 52.5 MiB, wrote: 71925760 bytes 68.6 MiB, delta: 16900096 bytes 16.1 MiB, wrote 30.7% more than predicted
freespace: trashing: was free: 75550720 bytes 72.1 MiB, need free: 20956632 bytes 20.0 MiB, files created: 248849, delete 179822 (72.3% of them)
freespace: was free: 46669824 bytes 44.5 MiB, wrote: 63197184 bytes 60.3 MiB, delta: 16527360 bytes 15.8 MiB, wrote 35.4% more than predicted
freespace: trashing: was free: 76214272 bytes 72.7 MiB, need free: 27942173 bytes 26.6 MiB, files created: 248789, delete 157576 (63.3% of them)
freespace: was free: 39129088 bytes 37.3 MiB, wrote: 55164928 bytes 52.6 MiB, delta: 16035840 bytes 15.3 MiB, wrote 41.0% more than predicted
freespace: trashing: was free: 77398016 bytes 73.8 MiB, need free: 34927714 bytes 33.3 MiB, files created: 248711, delete 136474 (54.9% of them)
freespace: was free: 32325632 bytes 30.8 MiB, wrote: 48234496 bytes 46.0 MiB, delta: 15908864 bytes 15.2 MiB, wrote 49.2% more than predicted
freespace: trashing: was free: 75796480 bytes 72.3 MiB, need free: 41913255 bytes 40.0 MiB, files created: 248674, delete 111164 (44.7% of them)
freespace: was free: 25079808 bytes 23.9 MiB, wrote: 40775680 bytes 38.9 MiB, delta: 15695872 bytes 15.0 MiB, wrote 62.6% more than predicted
freespace: trashing: was free: 78209024 bytes 74.6 MiB, need free: 48898796 bytes 46.6 MiB, files created: 248708, delete 93207 (37.5% of them)
freespace: was free: 20582400 bytes 19.6 MiB, wrote: 34844672 bytes 33.2 MiB, delta: 14262272 bytes 13.6 MiB, wrote 69.3% more than predicted
freespace: trashing: was free: 77328384 bytes 73.7 MiB, need free: 55884337 bytes 53.3 MiB, files created: 248644, delete 68951 (27.7% of them)
freespace: was free: 14368768 bytes 13.7 MiB, wrote: 28278784 bytes 27.0 MiB, delta: 13910016 bytes 13.3 MiB, wrote 96.8% more than predicted
freespace: trashing: was free: 77434880 bytes 73.8 MiB, need free: 62869878 bytes 60.0 MiB, files created: 248640, delete 46767 (18.8% of them)
freespace: was free: 8286208 bytes 7.9 MiB, wrote: 21811200 bytes 20.8 MiB, delta: 13524992 bytes 12.9 MiB, wrote 163.2% more than predicted
freespace: trashing: was free: 77856768 bytes 74.2 MiB, need free: 69855419 bytes 66.6 MiB, files created: 248576, delete 25546 (10.3% of them)
freespace: was free: 5570560 bytes 5.3 MiB, wrote: 8187904 bytes 7.8 MiB, delta: 2617344 bytes 2.5 MiB, wrote 47.0% more than predicted
freespace: Test 3 finished

freespace: finished successfully

After the change:

freespace: Test 1: fill the space we have 3 times
freespace: was free: 85204992 bytes 81.3 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 11284480 bytes 10.8 MiB, wrote 13.2% more than predicted
freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 12935168 bytes 12.3 MiB, wrote 15.5% more than predicted
freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 12939264 bytes 12.3 MiB, wrote 15.5% more than predicted
freespace: Test 1 finished

freespace: Test 2: gradually lessen amount of free space and fill the FS
freespace: do 10 steps, lessen free space by 7596218 bytes 7.2 MiB each time
freespace: was free: 78675968 bytes 75.0 MiB, wrote: 88903680 bytes 84.8 MiB, delta: 10227712 bytes 9.8 MiB, wrote 13.0% more than predicted
freespace: was free: 72015872 bytes 68.7 MiB, wrote: 81514496 bytes 77.7 MiB, delta: 9498624 bytes 9.1 MiB, wrote 13.2% more than predicted
freespace: was free: 63938560 bytes 61.0 MiB, wrote: 72589312 bytes 69.2 MiB, delta: 8650752 bytes 8.2 MiB, wrote 13.5% more than predicted
freespace: was free: 56127488 bytes 53.5 MiB, wrote: 63762432 bytes 60.8 MiB, delta: 7634944 bytes 7.3 MiB, wrote 13.6% more than predicted
freespace: was free: 48336896 bytes 46.1 MiB, wrote: 54935552 bytes 52.4 MiB, delta: 6598656 bytes 6.3 MiB, wrote 13.7% more than predicted
freespace: was free: 40587264 bytes 38.7 MiB, wrote: 46157824 bytes 44.0 MiB, delta: 5570560 bytes 5.3 MiB, wrote 13.7% more than predicted
freespace: was free: 32841728 bytes 31.3 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 4542464 bytes 4.3 MiB, wrote 13.8% more than predicted
freespace: was free: 25100288 bytes 23.9 MiB, wrote: 28618752 bytes 27.3 MiB, delta: 3518464 bytes 3.4 MiB, wrote 14.0% more than predicted
freespace: was free: 17342464 bytes 16.5 MiB, wrote: 19841024 bytes 18.9 MiB, delta: 2498560 bytes 2.4 MiB, wrote 14.4% more than predicted
freespace: was free: 9605120 bytes 9.2 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 1458176 bytes 1.4 MiB, wrote 15.2% more than predicted
freespace: Test 2 finished

freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS
freespace: do 10 steps, lessen free space by 7606272 bytes 7.3 MiB each time
freespace: trashing: was free: 83668992 bytes 79.8 MiB, need free: 7606272 bytes 7.3 MiB, files created: 248297, delete 225724 (90.9% of them)
freespace: was free: 70803456 bytes 67.5 MiB, wrote: 82485248 bytes 78.7 MiB, delta: 11681792 bytes 11.1 MiB, wrote 16.5% more than predicted
freespace: trashing: was free: 81080320 bytes 77.3 MiB, need free: 15212544 bytes 14.5 MiB, files created: 248711, delete 202047 (81.2% of them)
freespace: was free: 59867136 bytes 57.1 MiB, wrote: 71897088 bytes 68.6 MiB, delta: 12029952 bytes 11.5 MiB, wrote 20.1% more than predicted
freespace: trashing: was free: 82243584 bytes 78.4 MiB, need free: 22818816 bytes 21.8 MiB, files created: 248866, delete 179817 (72.3% of them)
freespace: was free: 50905088 bytes 48.5 MiB, wrote: 63168512 bytes 60.2 MiB, delta: 12263424 bytes 11.7 MiB, wrote 24.1% more than predicted
freespace: trashing: was free: 83402752 bytes 79.5 MiB, need free: 30425088 bytes 29.0 MiB, files created: 248920, delete 158114 (63.5% of them)
freespace: was free: 42651648 bytes 40.7 MiB, wrote: 55406592 bytes 52.8 MiB, delta: 12754944 bytes 12.2 MiB, wrote 29.9% more than predicted
freespace: trashing: was free: 84402176 bytes 80.5 MiB, need free: 38031360 bytes 36.3 MiB, files created: 248709, delete 136641 (54.9% of them)
freespace: was free: 35233792 bytes 33.6 MiB, wrote: 48250880 bytes 46.0 MiB, delta: 13017088 bytes 12.4 MiB, wrote 36.9% more than predicted
freespace: trashing: was free: 82530304 bytes 78.7 MiB, need free: 45637632 bytes 43.5 MiB, files created: 248778, delete 111208 (44.7% of them)
freespace: was free: 27287552 bytes 26.0 MiB, wrote: 40267776 bytes 38.4 MiB, delta: 12980224 bytes 12.4 MiB, wrote 47.6% more than predicted
freespace: trashing: was free: 85114880 bytes 81.2 MiB, need free: 53243904 bytes 50.8 MiB, files created: 248508, delete 93052 (37.4% of them)
freespace: was free: 22437888 bytes 21.4 MiB, wrote: 35328000 bytes 33.7 MiB, delta: 12890112 bytes 12.3 MiB, wrote 57.4% more than predicted
freespace: trashing: was free: 84103168 bytes 80.2 MiB, need free: 60850176 bytes 58.0 MiB, files created: 248637, delete 68743 (27.6% of them)
freespace: was free: 15536128 bytes 14.8 MiB, wrote: 28319744 bytes 27.0 MiB, delta: 12783616 bytes 12.2 MiB, wrote 82.3% more than predicted
freespace: trashing: was free: 84357120 bytes 80.4 MiB, need free: 68456448 bytes 65.3 MiB, files created: 248567, delete 46852 (18.8% of them)
freespace: was free: 9015296 bytes 8.6 MiB, wrote: 22044672 bytes 21.0 MiB, delta: 13029376 bytes 12.4 MiB, wrote 144.5% more than predicted
freespace: trashing: was free: 84942848 bytes 81.0 MiB, need free: 76062720 bytes 72.5 MiB, files created: 248636, delete 25993 (10.5% of them)
freespace: was free: 6086656 bytes 5.8 MiB, wrote: 8331264 bytes 7.9 MiB, delta: 2244608 bytes 2.1 MiB, wrote 36.9% more than predicted
freespace: Test 3 finished

freespace: finished successfully

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ubifs/misc.h   | 32 --------------------------------
 fs/ubifs/ubifs.h  |  1 +
 3 files changed, 46 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 9ef630a594c..7851480a6ce 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -701,6 +701,51 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 	ubifs_release_budget(c, &req);
 }
 
+/**
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space application tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexind nodes as well. This introduces additional
+ * overhead, and UBIFS it has to report sligtly less free space to meet the
+ * above expectetion.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+{
+	int divisor, factor;
+
+	/*
+	 * Reported space size is @free * X, where X is UBIFS block size
+	 * divided by UBIFS block size + all overhead one data block
+	 * introduces. The overhead is the node header + indexing overhead.
+	 *
+	 * Indexing overhead is calculations are based on the following
+	 * formula: I = N/(f - 1) + 1, where I - number of indexing nodes, N -
+	 * number of data nodes, f - fanout. Because effective UBIFS fanout is
+	 * twice as less than maximum fanout, we assume that each data node
+	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+	 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+	 * for the index.
+	 */
+	factor = UBIFS_BLOCK_SIZE;
+	divisor = UBIFS_MAX_DATA_NODE_SZ;
+	divisor += (c->max_idx_node_sz * 3) / ((c->fanout >> 1) - 1);
+	free *= factor;
+	do_div(free, divisor);
+	return free;
+}
+
 /**
  * ubifs_budg_get_free_space - return amount of free space.
  * @c: UBIFS file-system description object
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87ced4c74a6..4c12a9215d7 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -283,38 +283,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
 	return (void *)((struct ubifs_branch *)idx->branches)->key;
 }
 
-/**
- * ubifs_reported_space - calculate reported free space.
- * @c: the UBIFS file-system description object
- * @free: amount of free space
- *
- * This function calculates amount of free space which will be reported to
- * user-space. User-space application tend to expect that if the file-system
- * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
- * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
- *
- * This function assumes free space is made up of uncompressed data nodes and
- * full index nodes (one per data node, doubled because we always allow enough
- * space to write the index twice).
- *
- * Note, the calculation is pessimistic, which means that most of the time
- * UBIFS reports less space than it actually has.
- */
-static inline long long ubifs_reported_space(const struct ubifs_info *c,
-					     uint64_t free)
-{
-	int divisor, factor;
-
-	divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
-	factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
-	do_div(free, divisor);
-
-	return free * factor;
-}
-
 /**
  * ubifs_current_time - round current time to time granularity.
  * @inode: inode
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 7828d69ca4f..681d46e1628 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1441,6 +1441,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 long long ubifs_budg_get_free_space(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 
 /* find.c */
-- 
cgit v1.2.3


From ad507653a39e0d27404291e5d813683265388a20 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 18:32:57 +0300
Subject: UBIFS: fix assertion

The assertion was incorrect, because it did not take into
account free space.

This patch also amends the comments correspondingly, and
cleans them up a little.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/find.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index adee7b5ddea..9fc55ae7b03 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
  * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
  * or do not have an LEB which satisfies the @min_space criteria.
  *
- * Note:
- *   o LEBs which have less than dead watermark of dirty space are never picked
- *   by this function;
- *
- * Returns zero and the LEB properties of
- * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
- * negative error code in case of other failures. The returned LEB is marked as
- * "taken".
+ * Note, LEBs which have less than dead watermark of free + dirty space are
+ * never picked by this function.
  *
  * The additional @pick_free argument controls if this function has to return a
  * free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
  *
  * In addition @pick_free is set to %2 by the recovery process in order to
  * recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
  */
 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 			 int min_space, int pick_free)
@@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		lp = idx_lp;
 
 	if (lp) {
-		ubifs_assert(lp->dirty >= c->dead_wm);
+		ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
 		goto found;
 	}
 
-- 
cgit v1.2.3


From 131130b9a1e6e523c64b34137b14f88ae1382a6a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 18:34:45 +0300
Subject: UBIFS: add forgotten gc_idx_lebs component

We add this component at other similar places, but not in this
one.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/find.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 9fc55ae7b03..e045c8b5542 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -243,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		int lebs, rsvd_idx_lebs = 0;
 
 		spin_lock(&c->space_lock);
-		lebs = c->lst.empty_lebs;
+		lebs = c->lst.empty_lebs + c->idx_gc_cnt;
 		lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
 
 		/*
-- 
cgit v1.2.3


From 9bbb5726efb64e2cfed42f6eec07db80cd87e63b Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 22 Aug 2008 18:23:22 +0300
Subject: UBIFS: introduce LEB overhead

This is a preparational patch for the following statfs()
report fix.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 6 ++++++
 fs/ubifs/ubifs.h | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1018053519e..be23fd3cfd8 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -530,6 +530,12 @@ static int init_constants_early(struct ubifs_info *c)
 	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
 	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
 
+	/*
+	 * Calculate how many bytes would be wasted at the end of LEB if it was
+	 * fully filled with data nodes of maximum size. This is used in
+	 * calculations when reporting free space.
+	 */
+	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
 	return 0;
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 681d46e1628..57e58541de2 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -995,6 +995,9 @@ struct ubifs_mount_opts {
  * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
  * @max_inode_sz: maximum possible inode size in bytes
  * @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ *                data nodes of maximum size - used in free space reporting
  * @dead_wm: LEB dead space watermark
  * @dark_wm: LEB dark space watermark
  * @block_cnt: count of 4KiB blocks on the FS
@@ -1226,6 +1229,8 @@ struct ubifs_info {
 	int max_idx_node_sz;
 	long long max_inode_sz;
 	int max_znode_sz;
+
+	int leb_overhead;
 	int dead_wm;
 	int dark_wm;
 	int block_cnt;
-- 
cgit v1.2.3


From 7dad181bbe58b8fe9e170da28bcd5f6ec9addd6d Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 25 Aug 2008 18:58:19 +0300
Subject: UBIFS: improve statfs reporting even more

Since free space we report in statfs is file size which should
fit to the FS - change the way we calculate free space and use
leb_overhead instead of dark_wm in calculations.

Results of "freespace" test (120MiB volume, 16KiB LEB size,
512 bytes page size). Before the change:

freespace: Test 1: fill the space we have 3 times
freespace: was free: 85204992 bytes 81.3 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 11284480 bytes 10.8 MiB, wrote 13.2% more than predicted
freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 12935168 bytes 12.3 MiB, wrote 15.5% more than predicted
freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 12939264 bytes 12.3 MiB, wrote 15.5% more than predicted
freespace: Test 1 finished

freespace: Test 2: gradually lessen amount of free space and fill the FS
freespace: do 10 steps, lessen free space by 7596218 bytes 7.2 MiB each time
freespace: was free: 78675968 bytes 75.0 MiB, wrote: 88903680 bytes 84.8 MiB, delta: 10227712 bytes 9.8 MiB, wrote 13.0% more than predicted
freespace: was free: 72015872 bytes 68.7 MiB, wrote: 81514496 bytes 77.7 MiB, delta: 9498624 bytes 9.1 MiB, wrote 13.2% more than predicted
freespace: was free: 63938560 bytes 61.0 MiB, wrote: 72589312 bytes 69.2 MiB, delta: 8650752 bytes 8.2 MiB, wrote 13.5% more than predicted
freespace: was free: 56127488 bytes 53.5 MiB, wrote: 63762432 bytes 60.8 MiB, delta: 7634944 bytes 7.3 MiB, wrote 13.6% more than predicted
freespace: was free: 48336896 bytes 46.1 MiB, wrote: 54935552 bytes 52.4 MiB, delta: 6598656 bytes 6.3 MiB, wrote 13.7% more than predicted
freespace: was free: 40587264 bytes 38.7 MiB, wrote: 46157824 bytes 44.0 MiB, delta: 5570560 bytes 5.3 MiB, wrote 13.7% more than predicted
freespace: was free: 32841728 bytes 31.3 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 4542464 bytes 4.3 MiB, wrote 13.8% more than predicted
freespace: was free: 25100288 bytes 23.9 MiB, wrote: 28618752 bytes 27.3 MiB, delta: 3518464 bytes 3.4 MiB, wrote 14.0% more than predicted
freespace: was free: 17342464 bytes 16.5 MiB, wrote: 19841024 bytes 18.9 MiB, delta: 2498560 bytes 2.4 MiB, wrote 14.4% more than predicted
freespace: was free: 9605120 bytes 9.2 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 1458176 bytes 1.4 MiB, wrote 15.2% more than predicted
freespace: Test 2 finished

freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS
freespace: do 10 steps, lessen free space by 7606272 bytes 7.3 MiB each time
freespace: trashing: was free: 83668992 bytes 79.8 MiB, need free: 7606272 bytes 7.3 MiB, files created: 248297, delete 225724 (90.9% of them)
freespace: was free: 70803456 bytes 67.5 MiB, wrote: 82485248 bytes 78.7 MiB, delta: 11681792 bytes 11.1 MiB, wrote 16.5% more than predicted
freespace: trashing: was free: 81080320 bytes 77.3 MiB, need free: 15212544 bytes 14.5 MiB, files created: 248711, delete 202047 (81.2% of them)
freespace: was free: 59867136 bytes 57.1 MiB, wrote: 71897088 bytes 68.6 MiB, delta: 12029952 bytes 11.5 MiB, wrote 20.1% more than predicted
freespace: trashing: was free: 82243584 bytes 78.4 MiB, need free: 22818816 bytes 21.8 MiB, files created: 248866, delete 179817 (72.3% of them)
freespace: was free: 50905088 bytes 48.5 MiB, wrote: 63168512 bytes 60.2 MiB, delta: 12263424 bytes 11.7 MiB, wrote 24.1% more than predicted
freespace: trashing: was free: 83402752 bytes 79.5 MiB, need free: 30425088 bytes 29.0 MiB, files created: 248920, delete 158114 (63.5% of them)
freespace: was free: 42651648 bytes 40.7 MiB, wrote: 55406592 bytes 52.8 MiB, delta: 12754944 bytes 12.2 MiB, wrote 29.9% more than predicted
freespace: trashing: was free: 84402176 bytes 80.5 MiB, need free: 38031360 bytes 36.3 MiB, files created: 248709, delete 136641 (54.9% of them)
freespace: was free: 35233792 bytes 33.6 MiB, wrote: 48250880 bytes 46.0 MiB, delta: 13017088 bytes 12.4 MiB, wrote 36.9% more than predicted
freespace: trashing: was free: 82530304 bytes 78.7 MiB, need free: 45637632 bytes 43.5 MiB, files created: 248778, delete 111208 (44.7% of them)
freespace: was free: 27287552 bytes 26.0 MiB, wrote: 40267776 bytes 38.4 MiB, delta: 12980224 bytes 12.4 MiB, wrote 47.6% more than predicted
freespace: trashing: was free: 85114880 bytes 81.2 MiB, need free: 53243904 bytes 50.8 MiB, files created: 248508, delete 93052 (37.4% of them)
freespace: was free: 22437888 bytes 21.4 MiB, wrote: 35328000 bytes 33.7 MiB, delta: 12890112 bytes 12.3 MiB, wrote 57.4% more than predicted
freespace: trashing: was free: 84103168 bytes 80.2 MiB, need free: 60850176 bytes 58.0 MiB, files created: 248637, delete 68743 (27.6% of them)
freespace: was free: 15536128 bytes 14.8 MiB, wrote: 28319744 bytes 27.0 MiB, delta: 12783616 bytes 12.2 MiB, wrote 82.3% more than predicted
freespace: trashing: was free: 84357120 bytes 80.4 MiB, need free: 68456448 bytes 65.3 MiB, files created: 248567, delete 46852 (18.8% of them)
freespace: was free: 9015296 bytes 8.6 MiB, wrote: 22044672 bytes 21.0 MiB, delta: 13029376 bytes 12.4 MiB, wrote 144.5% more than predicted
freespace: trashing: was free: 84942848 bytes 81.0 MiB, need free: 76062720 bytes 72.5 MiB, files created: 248636, delete 25993 (10.5% of them)
freespace: was free: 6086656 bytes 5.8 MiB, wrote: 8331264 bytes 7.9 MiB, delta: 2244608 bytes 2.1 MiB, wrote 36.9% more than predicted
freespace: Test 3 finished

freespace: finished successfully

After the change:

freespace: Test 1: fill the space we have 3 times
freespace: was free: 94048256 bytes 89.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 2441216 bytes 2.3 MiB, wrote 2.6% more than predicted
freespace: was free: 92246016 bytes 88.0 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 4247552 bytes 4.1 MiB, wrote 4.6% more than predicted
freespace: was free: 92254208 bytes 88.0 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 4235264 bytes 4.0 MiB, wrote 4.6% more than predicted
freespace: Test 1 finished

freespace: Test 2: gradually lessen amount of free space and fill the FS
freespace: do 10 steps, lessen free space by 8386001 bytes 8.0 MiB each time
freespace: was free: 86605824 bytes 82.6 MiB, wrote: 88252416 bytes 84.2 MiB, delta: 1646592 bytes 1.6 MiB, wrote 1.9% more than predicted
freespace: was free: 78667776 bytes 75.0 MiB, wrote: 80715776 bytes 77.0 MiB, delta: 2048000 bytes 2.0 MiB, wrote 2.6% more than predicted
freespace: was free: 69615616 bytes 66.4 MiB, wrote: 71630848 bytes 68.3 MiB, delta: 2015232 bytes 1.9 MiB, wrote 2.9% more than predicted
freespace: was free: 61018112 bytes 58.2 MiB, wrote: 62783488 bytes 59.9 MiB, delta: 1765376 bytes 1.7 MiB, wrote 2.9% more than predicted
freespace: was free: 52424704 bytes 50.0 MiB, wrote: 53968896 bytes 51.5 MiB, delta: 1544192 bytes 1.5 MiB, wrote 2.9% more than predicted
freespace: was free: 43880448 bytes 41.8 MiB, wrote: 45199360 bytes 43.1 MiB, delta: 1318912 bytes 1.3 MiB, wrote 3.0% more than predicted
freespace: was free: 35332096 bytes 33.7 MiB, wrote: 36425728 bytes 34.7 MiB, delta: 1093632 bytes 1.0 MiB, wrote 3.1% more than predicted
freespace: was free: 26771456 bytes 25.5 MiB, wrote: 27643904 bytes 26.4 MiB, delta: 872448 bytes 852.0 KiB, wrote 3.3% more than predicted
freespace: was free: 18231296 bytes 17.4 MiB, wrote: 18878464 bytes 18.0 MiB, delta: 647168 bytes 632.0 KiB, wrote 3.5% more than predicted
freespace: was free: 9674752 bytes 9.2 MiB, wrote: 10088448 bytes 9.6 MiB, delta: 413696 bytes 404.0 KiB, wrote 4.3% more than predicted
freespace: Test 2 finished

freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS
freespace: do 10 steps, lessen free space by 8397544 bytes 8.0 MiB each time
freespace: trashing: was free: 92372992 bytes 88.1 MiB, need free: 8397552 bytes 8.0 MiB, files created: 248296, delete 225723 (90.9% of them)
freespace: was free: 71909376 bytes 68.6 MiB, wrote: 82472960 bytes 78.7 MiB, delta: 10563584 bytes 10.1 MiB, wrote 14.7% more than predicted
freespace: trashing: was free: 88989696 bytes 84.9 MiB, need free: 16795096 bytes 16.0 MiB, files created: 248794, delete 201838 (81.1% of them)
freespace: was free: 60354560 bytes 57.6 MiB, wrote: 71782400 bytes 68.5 MiB, delta: 11427840 bytes 10.9 MiB, wrote 18.9% more than predicted
freespace: trashing: was free: 90304512 bytes 86.1 MiB, need free: 25192640 bytes 24.0 MiB, files created: 248733, delete 179342 (72.1% of them)
freespace: was free: 51187712 bytes 48.8 MiB, wrote: 62943232 bytes 60.0 MiB, delta: 11755520 bytes 11.2 MiB, wrote 23.0% more than predicted
freespace: trashing: was free: 91209728 bytes 87.0 MiB, need free: 33590184 bytes 32.0 MiB, files created: 248779, delete 157160 (63.2% of them)
freespace: was free: 42704896 bytes 40.7 MiB, wrote: 55050240 bytes 52.5 MiB, delta: 12345344 bytes 11.8 MiB, wrote 28.9% more than predicted
freespace: trashing: was free: 92700672 bytes 88.4 MiB, need free: 41987728 bytes 40.0 MiB, files created: 248848, delete 136135 (54.7% of them)
freespace: was free: 35250176 bytes 33.6 MiB, wrote: 48115712 bytes 45.9 MiB, delta: 12865536 bytes 12.3 MiB, wrote 36.5% more than predicted
freespace: trashing: was free: 93986816 bytes 89.6 MiB, need free: 50385272 bytes 48.1 MiB, files created: 248723, delete 115385 (46.4% of them)
freespace: was free: 29995008 bytes 28.6 MiB, wrote: 41582592 bytes 39.7 MiB, delta: 11587584 bytes 11.1 MiB, wrote 38.6% more than predicted
freespace: trashing: was free: 91881472 bytes 87.6 MiB, need free: 58782816 bytes 56.1 MiB, files created: 248645, delete 89569 (36.0% of them)
freespace: was free: 22511616 bytes 21.5 MiB, wrote: 34705408 bytes 33.1 MiB, delta: 12193792 bytes 11.6 MiB, wrote 54.2% more than predicted
freespace: trashing: was free: 91774976 bytes 87.5 MiB, need free: 67180360 bytes 64.1 MiB, files created: 248580, delete 66616 (26.8% of them)
freespace: was free: 16908288 bytes 16.1 MiB, wrote: 26898432 bytes 25.7 MiB, delta: 9990144 bytes 9.5 MiB, wrote 59.1% more than predicted
freespace: trashing: was free: 92450816 bytes 88.2 MiB, need free: 75577904 bytes 72.1 MiB, files created: 248654, delete 45381 (18.3% of them)
freespace: was free: 10170368 bytes 9.7 MiB, wrote: 19111936 bytes 18.2 MiB, delta: 8941568 bytes 8.5 MiB, wrote 87.9% more than predicted
freespace: trashing: was free: 93282304 bytes 89.0 MiB, need free: 83975448 bytes 80.1 MiB, files created: 248513, delete 24794 (10.0% of them)
freespace: was free: 3911680 bytes 3.7 MiB, wrote: 7872512 bytes 7.5 MiB, delta: 3960832 bytes 3.8 MiB, wrote 101.3% more than predicted
freespace: Test 3 finished

freespace: finished successfully

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 38 ++++++++++++++++++++++++++++++++++----
 fs/ubifs/super.c  | 10 +++++-----
 fs/ubifs/ubifs.h  |  2 +-
 3 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 7851480a6ce..101d278c591 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -747,14 +747,24 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
 }
 
 /**
- * ubifs_budg_get_free_space - return amount of free space.
+ * ubifs_get_free_space - return amount of free space.
  * @c: UBIFS file-system description object
  *
- * This function returns amount of free space on the file-system.
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * amount of free flash space it has (well, because not all dirty space is
+ * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would bread user expectetion about what free space is. Users seem to
+ * accustomed to assume that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
-long long ubifs_budg_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space(struct ubifs_info *c)
 {
-	int min_idx_lebs;
+	int min_idx_lebs, rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
 	spin_lock(&c->space_lock);
@@ -771,6 +781,26 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c)
 	}
 
 	available = ubifs_calc_available(c, min_idx_lebs);
+
+	/*
+	 * When reporting free space to user-space, UBIFS guarantees that it is
+	 * possible to write a file of free space size. This means that for
+	 * empty LEBs we may use more precise calculations than
+	 * 'ubifs_calc_available()' is using. Namely, we know that in empty
+	 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+	 * Thus, amend the available space.
+	 *
+	 * Note, the calculations below are similar to what we have in
+	 * 'do_budget_space()', so refer there for comments.
+	 */
+	if (min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	lebs -= rsvd_idx_lebs;
+	available += lebs * (c->dark_wm - c->leb_overhead);
 	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index be23fd3cfd8..1207bd51ead 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -371,7 +371,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ubifs_info *c = dentry->d_sb->s_fs_info;
 	unsigned long long free;
 
-	free = ubifs_budg_get_free_space(c);
+	free = ubifs_get_free_space(c);
 	dbg_gen("free space %lld bytes (%lld blocks)",
 		free, free >> UBIFS_BLOCK_SHIFT);
 
@@ -653,11 +653,11 @@ static int init_constants_late(struct ubifs_info *c)
 	 * internally because it does not make much sense for UBIFS, but it is
 	 * necessary to report something for the 'statfs()' call.
 	 *
-	 * Subtract the LEB reserved for GC and the LEB which is reserved for
-	 * deletions.
+	 * Subtract the LEB reserved for GC, the LEB which is reserved for
+	 * deletions, and assume only one journal head is available.
 	 */
-	tmp64 = c->main_lebs - 2;
-	tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
+	tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+	tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
 	tmp64 = ubifs_reported_space(c, tmp64);
 	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 57e58541de2..17c620b93ee 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1443,7 +1443,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 				struct ubifs_budget_req *req);
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 			 struct ubifs_budget_req *req);
-long long ubifs_budg_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
 long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
-- 
cgit v1.2.3


From b3385c278d3c32aec68d4900b35bc07df1b2240c Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 31 Aug 2008 17:13:18 +0300
Subject: UBIFS: fill f_fsid

UBIFS stores 16-bit UUID in the superblock, and it is a good
idea to return part of it in 'f_fsid' filed of kstatfs structure.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1207bd51ead..0dee4042c6c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -386,6 +386,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = UBIFS_MAX_NLEN;
+	memcpy(&buf->f_fsid, c->uuid, sizeof(__kernel_fsid_t));
 
 	return 0;
 }
-- 
cgit v1.2.3


From 44be6fdf1056b685eb79e53e42bd2d321b085cfc Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 28 Aug 2008 11:36:19 -0500
Subject: dlm: fix address compare

Compare only the addr and port fields of sockaddr structures.
Fixes a problem with ipv6 where sin6_scope_id does not match.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/config.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 89d2fb7b991..1359be3b0bb 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <net/sock.h>
 
 #include "config.h"
@@ -776,6 +779,33 @@ static void put_space(struct dlm_space *sp)
 	config_item_put(&sp->group.cg_item);
 }
 
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+	switch (x->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+		struct sockaddr_in *siny = (struct sockaddr_in *)y;
+		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+			return 0;
+		if (sinx->sin_port != siny->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+			return 0;
+		if (sinx->sin6_port != siny->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+	return 1;
+}
+
 static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 {
 	struct config_item *i;
@@ -797,8 +827,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 			config_item_get(i);
 			break;
 		} else {
-			if (!cm->addr_count ||
-			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+			if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
 				continue;
 			found = 1;
 			config_item_get(i);
-- 
cgit v1.2.3


From 169ccbd44eb20f5bb7e4352451eba25397e29749 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 2 Sep 2008 14:35:37 -0700
Subject: NTFS: update homepage

Update the location of the NTFS homepage in several files.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/usnjrnl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e..4087fbdac32 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
  * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
  * file since it was last opened.  I think the names speak for themselves but
  * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ * documentation: http://www.linux-ntfs.org/
  */
 enum {
 	USN_REASON_DATA_OVERWRITE	= const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
  * Source info flags (32-bit).  Information about the source of the change(s)
  * to the file.  For detailed descriptions of what these mean, see the Linux
  * NTFS project NTFS documentation:
- *	http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ *	http://www.linux-ntfs.org/
  */
 enum {
 	USN_SOURCE_DATA_MANAGEMENT	  = const_cpu_to_le32(0x00000001),
-- 
cgit v1.2.3


From 4b8561521dbaa3d766b198496b220e984e3bf756 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 2 Sep 2008 14:35:53 -0700
Subject: mm: show quicklist usage in /proc/meminfo

Quicklists can consume several GB of memory.  We should provide a means of
monitoring this.

After this patch is applied, /proc/meminfo will output the following:

% cat /proc/meminfo

MemTotal:      7715392 kB
MemFree:       5401600 kB
Buffers:         80384 kB
Cached:         300800 kB
SwapCached:          0 kB
Active:         235584 kB
Inactive:       262656 kB
SwapTotal:     2031488 kB
SwapFree:      2031488 kB
Dirty:            3520 kB
Writeback:           0 kB
AnonPages:      117696 kB
Mapped:          38528 kB
Slab:          1589952 kB
SReclaimable:    23104 kB
SUnreclaim:    1566848 kB
PageTables:      14656 kB
NFS_Unstable:        0 kB
Bounce:              0 kB
WritebackTmp:        0 kB
CommitLimit:   5889152 kB
Committed_AS:   393152 kB
VmallocTotal: 17592177655808 kB
VmallocUsed:     29056 kB
VmallocChunk: 17592177626432 kB
Quicklists:     130944 kB
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
HugePages_Surp:      0
Hugepagesize:    262144 kB

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Keiichiro Tokunaga <tokunaga.keiich@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded96986296..00f10a2dcf1 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
 #include <linux/tty.h>
 #include <linux/string.h>
 #include <linux/mman.h>
+#include <linux/quicklist.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
@@ -189,7 +190,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Committed_AS: %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
 		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
+		"VmallocChunk: %8lu kB\n"
+		"Quicklists:   %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
@@ -221,7 +223,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
-		vmi.largest_chunk >> 10
+		vmi.largest_chunk >> 10,
+		K(quicklist_total_size())
 		);
 
 		len += hugetlb_report_meminfo(page + len);
-- 
cgit v1.2.3


From 7c7cbadf7341a0792879c67d6e3020f040d6cd7f Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 3 Sep 2008 14:16:42 +0300
Subject: UBIFS: amend f_fsid

David Woodhouse suggested to be consistent with other FSes
and xor the beginning and the end of the UUID.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0dee4042c6c..7562464ac83 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -370,6 +370,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ubifs_info *c = dentry->d_sb->s_fs_info;
 	unsigned long long free;
+	__le32 *uuid = (__le32 *)c->uuid;
 
 	free = ubifs_get_free_space(c);
 	dbg_gen("free space %lld bytes (%lld blocks)",
@@ -386,8 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = UBIFS_MAX_NLEN;
-	memcpy(&buf->f_fsid, c->uuid, sizeof(__kernel_fsid_t));
-
+	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f9f2ed486256f3480e4d499ffd6bf730bc5e6fc6 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 4 Sep 2008 12:51:20 -0500
Subject: dlm: remove bkl

BLK from recent pushdown is not needed.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/user.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 81627b502a5..b3832c67194 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,7 +15,6 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
-#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
 
@@ -637,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
 	struct dlm_user_proc *proc;
 	struct dlm_ls *ls;
 
-	lock_kernel();
 	ls = dlm_find_lockspace_device(iminor(inode));
-	if (!ls) {
-		unlock_kernel();
+	if (!ls)
 		return -ENOENT;
-	}
 
 	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
 	if (!proc) {
 		dlm_put_lockspace(ls);
-		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -659,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
 	spin_lock_init(&proc->locks_spin);
 	init_waitqueue_head(&proc->wait);
 	file->private_data = proc;
-	unlock_kernel();
 
 	return 0;
 }
@@ -914,7 +908,6 @@ int dlm_user_daemon_available(void)
 
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
-	cycle_kernel_lock();
 	file->private_data = NULL;
 	return 0;
 }
-- 
cgit v1.2.3


From dff5257473ca1e05002809809f51f858e9a966fc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 2 Sep 2008 13:33:17 +0100
Subject: GFS2: Fix race relating to glock min-hold time

In the case that a request for a glock arrives right after the
grant reply has arrived, it sometimes means that the gl_tstamp
field hasn't been updated recently enough. The net result is that
the min-hold time for the glock is ignored. If this happens
often enough, it leads to poor performance.

This patch adds an additional test, so that if the reply pending
bit is set on a glock, then it will select the maximum length of
time for the min-hold time, rather than looking at gl_tstamp.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4cbb6957a0d..806e1eb0aa0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
 	if (time_before(now, holdtime))
 		delay = holdtime - now;
+	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+		delay = gl->gl_ops->go_min_hold_time;
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, 1, delay);
-- 
cgit v1.2.3


From bd1eb8818cc2c8ddab86be027ab43fb852942704 Mon Sep 17 00:00:00 2001
From: Julien Brunel <brunel@diku.dk>
Date: Mon, 1 Sep 2008 10:51:22 +0200
Subject: GFS2: Use an IS_ERR test rather than a NULL test

In case of error, the function gfs2_inode_lookup returns an
ERR pointer, but never returns a NULL pointer. So a NULL test that
necessarily comes after an IS_ERR test should be deleted, and a NULL
test that may come after a call to this function should be
strengthened by an IS_ERR test.

The semantic match that finds this problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@match_bad_null_test@
expression x, E;
statement S1,S2;
@@
x = gfs2_inode_lookup(...)
... when != x = E
* if (x != NULL)
S1 else S2
// </smpl>

Signed-off-by:  Julien Brunel <brunel@diku.dk>
Signed-off-by:  Julia Lawall <julia@diku.dk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 87525523446..c8a959c09f1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1033,13 +1033,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 
 	if (bh)
 		brelse(bh);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
 	return inode;
 
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
-	if (inode)
+	if (inode && !IS_ERR(inode))
 		iput(inode);
 fail_gunlock:
 	gfs2_glock_dq(ghs);
-- 
cgit v1.2.3


From 27eccf46491e1f77f9af9bbe0778122ce6882890 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 5 Sep 2008 08:42:08 -0500
Subject: dlm: choose better identifiers

sparc32:

fs/dlm/config.c:397: error: expected identifier or '(' before '{' token
fs/dlm/config.c: In function 'drop_node':
fs/dlm/config.c:589: warning: initialization from incompatible pointer type
fs/dlm/config.c:589: warning: initialization from incompatible pointer type
fs/dlm/config.c: In function 'release_node':
fs/dlm/config.c:601: warning: initialization from incompatible pointer type
fs/dlm/config.c:601: warning: initialization from incompatible pointer type
fs/dlm/config.c: In function 'show_node':
fs/dlm/config.c:717: warning: initialization from incompatible pointer type
fs/dlm/config.c:717: warning: initialization from incompatible pointer type
fs/dlm/config.c: In function 'store_node':
fs/dlm/config.c:726: warning: initialization from incompatible pointer type
fs/dlm/config.c:726: warning: initialization from incompatible pointer type

Cc: Christine Caulfield <ccaulfie@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/config.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 1359be3b0bb..fd9859f92fa 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -380,24 +380,24 @@ static struct config_item_type node_type = {
 	.ct_owner = THIS_MODULE,
 };
 
-static struct dlm_cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
 {
 	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
 		   NULL;
 }
 
-static struct dlm_space *to_space(struct config_item *i)
+static struct dlm_space *config_item_to_space(struct config_item *i)
 {
 	return i ? container_of(to_config_group(i), struct dlm_space, group) :
 		   NULL;
 }
 
-static struct dlm_comm *to_comm(struct config_item *i)
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
 {
 	return i ? container_of(i, struct dlm_comm, item) : NULL;
 }
 
-static struct dlm_node *to_node(struct config_item *i)
+static struct dlm_node *config_item_to_node(struct config_item *i)
 {
 	return i ? container_of(i, struct dlm_node, item) : NULL;
 }
@@ -453,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g,
 
 static void drop_cluster(struct config_group *g, struct config_item *i)
 {
-	struct dlm_cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct config_item *tmp;
 	int j;
 
@@ -471,7 +471,7 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
 
 static void release_cluster(struct config_item *i)
 {
-	struct dlm_cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	kfree(cl->group.default_groups);
 	kfree(cl);
 }
@@ -510,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 
 static void drop_space(struct config_group *g, struct config_item *i)
 {
-	struct dlm_space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	struct config_item *tmp;
 	int j;
 
@@ -527,7 +527,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
 
 static void release_space(struct config_item *i)
 {
-	struct dlm_space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	kfree(sp->group.default_groups);
 	kfree(sp);
 }
@@ -549,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 
 static void drop_comm(struct config_group *g, struct config_item *i)
 {
-	struct dlm_comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	if (local_comm == cm)
 		local_comm = NULL;
 	dlm_lowcomms_close(cm->nodeid);
@@ -560,13 +560,13 @@ static void drop_comm(struct config_group *g, struct config_item *i)
 
 static void release_comm(struct config_item *i)
 {
-	struct dlm_comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	kfree(cm);
 }
 
 static struct config_item *make_node(struct config_group *g, const char *name)
 {
-	struct dlm_space *sp = to_space(g->cg_item.ci_parent);
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
 	struct dlm_node *nd;
 
 	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
@@ -588,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 
 static void drop_node(struct config_group *g, struct config_item *i)
 {
-	struct dlm_space *sp = to_space(g->cg_item.ci_parent);
-	struct dlm_node *nd = to_node(i);
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd = config_item_to_node(i);
 
 	mutex_lock(&sp->members_lock);
 	list_del(&nd->list);
@@ -601,7 +601,7 @@ static void drop_node(struct config_group *g, struct config_item *i)
 
 static void release_node(struct config_item *i)
 {
-	struct dlm_node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	kfree(nd);
 }
 
@@ -635,7 +635,7 @@ void dlm_config_exit(void)
 static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
 			    char *buf)
 {
-	struct dlm_cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct cluster_attribute *cla =
 			container_of(a, struct cluster_attribute, attr);
 	return cla->show ? cla->show(cl, buf) : 0;
@@ -645,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
 			     struct configfs_attribute *a,
 			     const char *buf, size_t len)
 {
-	struct dlm_cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct cluster_attribute *cla =
 		container_of(a, struct cluster_attribute, attr);
 	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -654,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 			 char *buf)
 {
-	struct dlm_comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	struct comm_attribute *cma =
 			container_of(a, struct comm_attribute, attr);
 	return cma->show ? cma->show(cm, buf) : 0;
@@ -663,7 +663,7 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
 			  const char *buf, size_t len)
 {
-	struct dlm_comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	struct comm_attribute *cma =
 		container_of(a, struct comm_attribute, attr);
 	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
@@ -717,7 +717,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 			 char *buf)
 {
-	struct dlm_node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	struct node_attribute *nda =
 			container_of(a, struct node_attribute, attr);
 	return nda->show ? nda->show(nd, buf) : 0;
@@ -726,7 +726,7 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
 			  const char *buf, size_t len)
 {
-	struct dlm_node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	struct node_attribute *nda =
 		container_of(a, struct node_attribute, attr);
 	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
@@ -771,7 +771,7 @@ static struct dlm_space *get_space(char *name)
 	i = config_group_find_item(space_list, name);
 	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
-	return to_space(i);
+	return config_item_to_space(i);
 }
 
 static void put_space(struct dlm_space *sp)
@@ -818,7 +818,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 	mutex_lock(&clusters_root.subsys.su_mutex);
 
 	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
-		cm = to_comm(i);
+		cm = config_item_to_comm(i);
 
 		if (nodeid) {
 			if (cm->nodeid != nodeid)
-- 
cgit v1.2.3


From 49048622eae698e5c4ae61f7e71200f265ccc529 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Fri, 5 Sep 2008 18:12:23 +0200
Subject: sched: fix process time monotonicity

Spencer reported a problem where utime and stime were going negative despite
the fixes in commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa. The suspected
reason for the problem is that signal_struct maintains it's own utime and
stime (of exited tasks), these are not updated using the new task_utime()
routine, hence sig->utime can go backwards and cause the same problem
to occur (sig->utime, adds tsk->utime and not task_utime()). This patch
fixes the problem

TODO: using max(task->prev_utime, derived utime) works for now, but a more
generic solution is to implement cputime_max() and use the cputime_gt()
function for comparison.

Reported-by: spencer@bluehost.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c | 59 ---------------------------------------------------------
 1 file changed, 59 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c..71c9be59c9c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
-	u64 temp;
-
-	/*
-	 * Use CFS's precise accounting:
-	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-
-	if (total) {
-		temp *= utime;
-		do_div(temp, total);
-	}
-	utime = (clock_t)temp;
-
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-	return p->prev_utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	clock_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-
-	return p->prev_stime;
-}
-#endif
-
-static cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task, int whole)
 {
-- 
cgit v1.2.3


From f171d4d769c8ccac6675892960e37f6485837fae Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 3 Sep 2008 16:17:14 +0300
Subject: UBIFS: fix division by zero

If fanout is 3, we have division by zero in
'ubifs_read_superblock()':

divide error: 0000 [#1] PREEMPT SMP

Pid: 28744, comm: mount Not tainted (2.6.27-rc4-ubifs-2.6 #23)
EIP: 0060:[<f8f9e3ef>] EFLAGS: 00010202 CPU: 0
EIP is at ubifs_reported_space+0x2d/0x69 [ubifs]
EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000000
ESI: 00000000 EDI: f0ae64b0 EBP: f1f9fcf4 ESP: f1f9fce0
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 101d278c591..73db464cd08 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -723,24 +723,25 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
  */
 long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
 {
-	int divisor, factor;
+	int divisor, factor, f;
 
 	/*
 	 * Reported space size is @free * X, where X is UBIFS block size
 	 * divided by UBIFS block size + all overhead one data block
 	 * introduces. The overhead is the node header + indexing overhead.
 	 *
-	 * Indexing overhead is calculations are based on the following
-	 * formula: I = N/(f - 1) + 1, where I - number of indexing nodes, N -
-	 * number of data nodes, f - fanout. Because effective UBIFS fanout is
-	 * twice as less than maximum fanout, we assume that each data node
+	 * Indexing overhead calculations are based on the following formula:
+	 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+	 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
+	 * as less than maximum fanout, we assume that each data node
 	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
 	 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
 	 * for the index.
 	 */
+	f = c->fanout > 3 ? c->fanout >> 1 : 2;
 	factor = UBIFS_BLOCK_SIZE;
 	divisor = UBIFS_MAX_DATA_NODE_SZ;
-	divisor += (c->max_idx_node_sz * 3) / ((c->fanout >> 1) - 1);
+	divisor += (c->max_idx_node_sz * 3) / (f - 1);
 	free *= factor;
 	do_div(free, divisor);
 	return free;
-- 
cgit v1.2.3


From a5cb562d6977d9d7989c346b7b153cef31ec0228 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 3 Sep 2008 18:26:47 +0300
Subject: UBIFS: make minimum fanout 3

UBIFS does not really work correctly when fanout is 2,
because of the way we manage the indexing tree. It may
just become a list and UBIFS screws up.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ubifs-media.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index bd2121f3426..a9ecbd9af20 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
 #define UBIFS_SK_LEN 8
 
 /* Minimum index tree fanout */
-#define UBIFS_MIN_FANOUT 2
+#define UBIFS_MIN_FANOUT 3
 
 /* Maximum number of levels in UBIFS indexing B-tree */
 #define UBIFS_MAX_LEVELS 512
-- 
cgit v1.2.3


From a30d542a0035b886ffaafd0057ced0a2b28c3a4f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 9 Oct 2008 10:56:23 -0400
Subject: ext4: Make sure all the block allocation paths reserve blocks

With delayed allocation we need to make sure block are reserved before
we attempt to allocate them. Otherwise we get block allocation failure
(ENOSPC) during writepages which cannot be handled. This would mean
silent data loss (We do a printk stating data will be lost). This patch
updates the DIO and fallocate code path to do block reservation before
block allocation. This is needed to make sure parallel DIO and fallocate
request doesn't take block out of delayed reserve space.

When free blocks count go below a threshold we switch to a slow patch
which looks at other CPU's accumulated percpu counter values.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 58 ++++++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/ext4.h    | 13 +++++++++++++
 fs/ext4/inode.c   |  5 +----
 fs/ext4/mballoc.c | 23 ++++++++++++----------
 4 files changed, 69 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 58005c01abb..1707850301d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1602,6 +1602,32 @@ out:
 	return ret;
 }
 
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
+{
+	s64 free_blocks;
+	ext4_fsblk_t root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+	free_blocks = percpu_counter_read(fbc);
+
+	if (!capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+		free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+	if (free_blocks < (root_blocks + nblocks))
+		/* we don't have free space */
+		return -ENOSPC;
+
+	/* reduce fs free blocks counter */
+	percpu_counter_sub(fbc, nblocks);
+	return 0;
+}
+
 /**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
@@ -1623,18 +1649,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum(&sbi->s_freeblocks_counter);
-#endif
+
+	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+		free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+
 	if (free_blocks <= root_blocks)
 		/* we don't have free space */
 		return 0;
 	if (free_blocks - root_blocks < nblocks)
 		return free_blocks - root_blocks;
 	return nblocks;
- }
+}
 
 
 /**
@@ -1713,14 +1738,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		*count = ext4_has_free_blocks(sbi, *count);
-	}
-	if (*count == 0) {
-		*errp = -ENOSPC;
-		return 0;	/*return with ENOSPC error */
+		if (ext4_claim_free_blocks(sbi, *count)) {
+			*errp = -ENOSPC;
+			return 0;	/*return with ENOSPC error */
+		}
 	}
-	num = *count;
-
 	/*
 	 * Check quota for allocation of this block.
 	 */
@@ -1915,9 +1937,13 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+		/*
+		 * we allocated less blocks than we
+		 * claimed. Add the difference back.
+		 */
+		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+	}
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8c701318844..0154c2d0b24 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -983,6 +983,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					unsigned long *count, int *errp);
 extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
@@ -1207,6 +1209,17 @@ do {								\
 		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+
 /*
  * Inodes and files operations
  */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b6fa0c4087e..b778d5a33ea 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
-	if (ext4_has_free_blocks(sbi, total) < total) {
+	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		return -ENOSPC;
 	}
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
 	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0db2ccfa0da..2c10b5058a8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2975,9 +2975,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	 * at write_begin() time for delayed allocation
 	 * do not double accounting
 	 */
-	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
-		percpu_counter_sub(&sbi->s_freeblocks_counter,
-					ac->ac_b_ex.fe_len);
+	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+			ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+		/*
+		 * we allocated less blocks than we calimed
+		 * Add the difference back
+		 */
+		percpu_counter_add(&sbi->s_freeblocks_counter,
+				ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+	}
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4389,14 +4395,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		ar->len = ext4_has_free_blocks(sbi, ar->len);
-	}
-
-	if (ar->len == 0) {
-		*errp = -ENOSPC;
-		return 0;
+		if (ext4_claim_free_blocks(sbi, ar->len)) {
+			*errp = -ENOSPC;
+			return 0;
+		}
 	}
-
 	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
 		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
 		ar->len--;
-- 
cgit v1.2.3


From 030ba6bc67b4f2bc5cd174f57785a1745c929abe Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 8 Sep 2008 23:14:50 -0400
Subject: ext4: Retry block reservation

During block reservation if we don't have enough blocks left, retry
block reservation with smaller block counts.  This makes sure we try
fallocate and DIO with smaller request size and don't fail early.  The
delayed allocation reservation cannot try with smaller block count. So
retry block reservation to handle temporary disk full conditions.  Also
print free blocks details if we fail block allocation during writepages.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  8 +++++++-
 fs/ext4/inode.c   | 14 +++++++++++---
 fs/ext4/mballoc.c |  7 ++++++-
 3 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1707850301d..57909882c08 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1738,10 +1738,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		if (ext4_claim_free_blocks(sbi, *count)) {
+		while (*count && ext4_claim_free_blocks(sbi, *count)) {
+			/* let others to free the space */
+			yield();
+			*count = *count >> 1;
+		}
+		if (!*count) {
 			*errp = -ENOSPC;
 			return 0;	/*return with ENOSPC error */
 		}
+		num = *count;
 	}
 	/*
 	 * Check quota for allocation of this block.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b778d5a33ea..eb9d449817d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1521,6 +1521,7 @@ static int ext4_journalled_write_end(struct file *file,
 
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
+	int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned long md_needed, mdblocks, total = 0;
 
@@ -1529,6 +1530,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
+repeat:
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
 	mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1539,6 +1541,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+			yield();
+			goto repeat;
+		}
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1825,20 +1831,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
+	struct buffer_head new;
 	struct buffer_head *lbh = &mpd->lbh;
 	sector_t next = lbh->b_blocknr;
-	struct buffer_head new;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
 		return 0;
-
 	new.b_state = lbh->b_state;
 	new.b_blocknr = 0;
 	new.b_size = lbh->b_size;
-
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
@@ -1871,6 +1875,10 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 				  lbh->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_EMERG "This should not happen.!! "
 					"Data will be lost\n");
+		if (err == -ENOSPC) {
+			printk(KERN_CRIT "Total free blocks count %lld\n",
+				ext4_count_free_blocks(mpd->inode->i_sb));
+		}
 		/* invlaidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
 				lbh->b_size >> mpd->inode->i_blkbits);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2c10b5058a8..e4f30de11a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4395,7 +4395,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		if (ext4_claim_free_blocks(sbi, ar->len)) {
+		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+			/* let others to free the space */
+			yield();
+			ar->len = ar->len >> 1;
+		}
+		if (!ar->len) {
 			*errp = -ENOSPC;
 			return 0;
 		}
-- 
cgit v1.2.3


From 6bc6e63fcd7dac9e633ea29f1fddd9580ab28f3f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 10 Oct 2008 09:39:00 -0400
Subject: ext4: Add percpu dirty block accounting.

This patch adds dirty block accounting using percpu_counters.  Delayed
allocation block reservation is now done by updating dirty block
counter.  In a later patch we switch to non delalloc mode if the
filesystem free blocks is greater than 150% of total filesystem dirty
blocks

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao<cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 62 +++++++++++++++++++++++++++++++++++--------------------
 fs/ext4/ext4_sb.h |  1 +
 fs/ext4/inode.c   | 22 ++++++++++----------
 fs/ext4/mballoc.c | 31 +++++++++++++---------------
 fs/ext4/super.c   |  8 ++++++-
 5 files changed, 73 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 57909882c08..edef0023e6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1605,26 +1605,38 @@ out:
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks)
 {
-	s64 free_blocks;
+	s64 free_blocks, dirty_blocks;
 	ext4_fsblk_t root_blocks = 0;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read(fbc);
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
-		free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
-
-	if (free_blocks < (root_blocks + nblocks))
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
+		}
+	}
+	/* Check whether we have space after
+	 * accounting for current dirty blocks
+	 */
+	if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
 		return -ENOSPC;
 
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(fbc, nblocks);
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
 	return 0;
 }
 
@@ -1640,23 +1652,28 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks)
 {
-	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t free_blocks, dirty_blocks;
 	ext4_fsblk_t root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
-		free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-
-	if (free_blocks <= root_blocks)
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum_positive(fbc);
+		dirty_blocks = percpu_counter_sum_positive(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
 		/* we don't have free space */
 		return 0;
-	if (free_blocks - root_blocks < nblocks)
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
 		return free_blocks - root_blocks;
 	return nblocks;
 }
@@ -1943,13 +1960,14 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
-		/*
-		 * we allocated less blocks than we
-		 * claimed. Add the difference back.
-		 */
-		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
-	}
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	/*
+	 * Now reduce the dirty block count also. Should not go negative
+	 */
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, *count);
+	else
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 69810a25253..a5577e0ccd3 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -59,6 +59,7 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock s_blockgroup_lock;
 
 	/* root of the per fs reservation window tree */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index eb9d449817d..7875a2dd54b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1030,19 +1030,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
 
-	/* Account for allocated meta_blocks */
-	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+	if (mdb_free) {
+		/* Account for allocated meta_blocks */
+		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+		/* update fs dirty blocks counter */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+		EXT4_I(inode)->i_allocated_meta_blocks = 0;
+		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+	}
 
 	/* update per-inode reservations */
 	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
 
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-	EXT4_I(inode)->i_allocated_meta_blocks = 0;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
@@ -1588,8 +1589,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	release = to_free + mdb_free;
 
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, release);
+	/* update fs dirty blocks counter for truncate case */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
 
 	/* update per-inode reservations */
 	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -2471,7 +2472,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
-
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e4f30de11a9..585c2595018 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2880,7 +2880,7 @@ void exit_ext4_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-				handle_t *handle)
+				handle_t *handle, unsigned long reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_super_block *es;
@@ -2969,21 +2969,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
-	 * free blocks account has already be reduced/reserved
-	 * at write_begin() time for delayed allocation
-	 * do not double accounting
+	 * Now reduce the dirty block count also. Should not go negative
 	 */
-	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
-			ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
-		/*
-		 * we allocated less blocks than we calimed
-		 * Add the difference back
-		 */
-		percpu_counter_add(&sbi->s_freeblocks_counter,
-				ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
-	}
+	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+		/* release all the reserved blocks if non delalloc */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+	else
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+						ac->ac_b_ex.fe_len);
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4376,12 +4371,13 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 				 struct ext4_allocation_request *ar, int *errp)
 {
+	int freed;
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	int freed;
-	int inquota;
+	unsigned long inquota;
+	unsigned long reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
@@ -4404,6 +4400,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			*errp = -ENOSPC;
 			return 0;
 		}
+		reserv_blks = ar->len;
 	}
 	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
 		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4450,7 +4447,7 @@ repeat:
 	}
 
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
-		*errp = ext4_mb_mark_diskspace_used(ac, handle);
+		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
 		if (*errp ==  -EAGAIN) {
 			ac->ac_b_ex.fe_group = 0;
 			ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7de6ca0c9e9..efa40d9d379 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
@@ -2491,6 +2495,7 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3169,7 +3174,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 	ext4_free_blocks_count_set(es, buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
-- 
cgit v1.2.3


From 79f0be8d2e6ebde27dfb3beff18eb689d5c4e36c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 8 Oct 2008 23:13:30 -0400
Subject: ext4: Switch to non delalloc mode when we are low on free blocks
 count.

The delayed allocation code allocates blocks during writepages(), which
can not handle block allocation failures.  To deal with this, we switch
away from delayed allocation mode when we are running low on free
blocks.  This also allows us to avoid needing to reserve a large number
of meta-data blocks in case all of the requested blocks are
discontiguous.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7875a2dd54b..b1a6a7373f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2458,6 +2458,33 @@ out_writepages:
 	return ret;
 }
 
+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_blocks, dirty_blocks;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * switch to non delalloc mode if we are running low
+	 * on free block. The free block accounting via percpu
+	 * counters can get slightly wrong with FBC_BATCH getting
+	 * accumulated on each CPU without updating global counters
+	 * Delalloc need an accurate free block accounting. So switch
+	 * to non delalloc when we are near to error range.
+	 */
+	free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+	if (2 * free_blocks < 3 * dirty_blocks ||
+		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+		/*
+		 * free block count is less that 150% of dirty blocks
+		 * or free blocks is less that watermark
+		 */
+		return 1;
+	}
+	return 0;
+}
+
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
@@ -2472,6 +2499,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
+
+	if (ext4_nonda_switch(inode->i_sb)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(file, mapping, pos,
+					len, flags, pagep, fsdata);
+	}
+	*fsdata = (void *)0;
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2540,6 +2574,19 @@ static int ext4_da_write_end(struct file *file,
 	handle_t *handle = ext4_journal_current_handle();
 	loff_t new_i_size;
 	unsigned long start, end;
+	int write_mode = (int)(unsigned long)fsdata;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+		if (ext4_should_order_data(inode)) {
+			return ext4_ordered_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else if (ext4_should_writeback_data(inode)) {
+			return ext4_writeback_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else {
+			BUG();
+		}
+	}
 
 	start = pos & (PAGE_CACHE_SIZE - 1);
 	end = start + copied - 1;
@@ -4877,6 +4924,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
+	void *fsdata;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -4915,11 +4963,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	 * on the same page though
 	 */
 	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
-			len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
-			len, len, page, NULL);
+			len, len, page, fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = 0;
-- 
cgit v1.2.3


From 5c79161689aede2d487d707d5931a22eadf66120 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 8 Oct 2008 23:12:24 -0400
Subject: ext4: Signed arithmetic fix

This patch converts some usage of ext4_fsblk_t to s64.  This is needed
so that some of the sign conversion works as expected in if loops.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 19 ++++++++++---------
 fs/ext4/ext4.h   |  5 ++---
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index edef0023e6e..a425e78c73e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1603,10 +1603,10 @@ out:
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks)
+						s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 root_blocks = 0;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
 	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
@@ -1631,7 +1631,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 	/* Check whether we have space after
 	 * accounting for current dirty blocks
 	 */
-	if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
+	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
 		return -ENOSPC;
 
@@ -1650,10 +1650,10 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  * On success, return nblocks
  */
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks)
+						s64 nblocks)
 {
-	ext4_fsblk_t free_blocks, dirty_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
 	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
@@ -1667,14 +1667,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum_positive(fbc);
-		dirty_blocks = percpu_counter_sum_positive(dbc);
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
 	}
 	if (free_blocks <= (root_blocks + dirty_blocks))
 		/* we don't have free space */
 		return 0;
+
 	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
-		return free_blocks - root_blocks;
+		return free_blocks - (root_blocks + dirty_blocks);
 	return nblocks;
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0154c2d0b24..e13b9deee86 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -983,10 +983,9 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					unsigned long *count, int *errp);
 extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks);
+					 s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-- 
cgit v1.2.3


From 68629f29c6764c37ebdceec2f6bbef6637eaf420 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 8 Sep 2008 23:09:17 -0400
Subject: ext4: Fix ext4 nomballoc allocator for ENOSPC

We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
on the filesystem.

The patch includes two changes:

a) Set reservation to NULL if we trying to allocate near group_target_block
from the goal group if the free block in the group is less than windows.
This should give us a better chance to allocate near group_target_block.
This also ensures that if we are not allocating near group_target_block
then we don't trun off reservation. This should enable us to allocate
with reservation from other groups that have large free blocks count.

b) we don't need to check the window size if the block reservation is off.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a425e78c73e..ae26c37e398 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1804,15 +1804,17 @@ retry_alloc:
 		goto io_error;
 
 	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	/*
-	 * if there is not enough free blocks to make a new resevation
-	 * turn off reservation for this allocation
-	 */
-	if (my_rsv && (free_blocks < windowsz)
-		&& (rsv_is_empty(&my_rsv->rsv_window)))
-		my_rsv = NULL;
 
 	if (free_blocks > 0) {
+		/*
+		 * try to allocate with group target block
+		 * in the goal group. If we have low free_blocks
+		 * count turn off reservation
+		 */
+		if (my_rsv && (free_blocks < windowsz)
+			&& (rsv_is_empty(&my_rsv->rsv_window)))
+			my_rsv = NULL;
+
 		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
@@ -1845,7 +1847,7 @@ retry_alloc:
 		 * free blocks is less than half of the reservation
 		 * window size.
 		 */
-		if (free_blocks <= (windowsz/2))
+		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
 		brelse(bitmap_bh);
-- 
cgit v1.2.3


From 166348dd37a4baacfb6fe495954b56f56b116f0c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 8 Sep 2008 23:08:40 -0400
Subject: ext4: Don't add the inode to journal handle until after the block is
 allocated

Make sure we don't add the inode to the journal handle until after the
block allocation, so that a journal commit will not include the inode in
case of block allocation failure.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c |  2 +-
 fs/ext4/inode.c  | 36 +++++++++++++++---------------------
 2 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ae26c37e398..cca7fd53ad7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -2063,7 +2063,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Account for the allocated meta blocks
 	 */
-	if (!(*errp)) {
+	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += *count;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b1a6a7373f0..b567e71f5be 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2170,18 +2170,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle_t *handle = NULL;
 
 	handle = ext4_journal_current_handle();
-	if (!handle) {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, 0, 0, 0);
-		BUG_ON(!ret);
-	} else {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	}
-
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+			bh_result, create, 0, EXT4_DELALLOC_RSVED);
 	if (ret > 0) {
+
 		bh_result->b_size = (ret << inode->i_blkbits);
 
+		if (ext4_should_order_data(inode)) {
+			int retval;
+			retval = ext4_jbd2_file_inode(handle, inode);
+			if (retval)
+				/*
+				 * Failed to add inode for ordered
+				 * mode. Don't update file size
+				 */
+				return retval;
+		}
+
 		/*
 		 * Update on-disk size along with block allocation
 		 * we don't use 'extend_disksize' as size may change
@@ -2407,18 +2413,6 @@ restart_loop:
 			dump_stack();
 			goto out_writepages;
 		}
-		if (ext4_should_order_data(inode)) {
-			/*
-			 * With ordered mode we need to add
-			 * the inode to the journal handl
-			 * when we do block allocation.
-			 */
-			ret = ext4_jbd2_file_inode(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
-				goto out_writepages;
-			}
-		}
 
 		to_write -= wbc->nr_to_write;
 		ret = mpage_da_writepages(mapping, wbc,
-- 
cgit v1.2.3


From df22291ff0fde0d350cf15dac3e5cc33ac528875 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 8 Sep 2008 23:05:34 -0400
Subject: ext4: Retry block allocation if we have free blocks left

When we truncate files, the meta-data blocks released are not reused
untill we commit the truncate transaction.  That means delayed get_block
request will return ENOSPC even if we have free blocks left.  Force a
journal commit and retry block allocation if we get ENOSPC with free
blocks left.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 81 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b567e71f5be..f97b3478eb8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1634,6 +1634,7 @@ struct mpage_da_data {
 	struct writeback_control *wbc;
 	int io_done;
 	long pages_written;
+	int retval;
 };
 
 /*
@@ -1820,6 +1821,24 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 	return;
 }
 
+static void ext4_print_free_blocks(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	printk(KERN_EMERG "Total free blocks count %lld\n",
+			ext4_count_free_blocks(inode->i_sb));
+	printk(KERN_EMERG "Free/Dirty block details\n");
+	printk(KERN_EMERG "free_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_freeblocks_counter));
+	printk(KERN_EMERG "dirty_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+	printk(KERN_EMERG "Block reservation details\n");
+	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_data_blocks);
+	printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_meta_blocks);
+	return;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -1834,7 +1853,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 	int err = 0;
 	struct buffer_head new;
 	struct buffer_head *lbh = &mpd->lbh;
-	sector_t next = lbh->b_blocknr;
+	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
@@ -1844,6 +1863,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 	new.b_state = lbh->b_state;
 	new.b_blocknr = 0;
 	new.b_size = lbh->b_size;
+	next = lbh->b_blocknr;
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
@@ -1860,6 +1880,13 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 		 */
 		if (err == -EAGAIN)
 			return 0;
+
+		if (err == -ENOSPC &&
+				ext4_count_free_blocks(mpd->inode->i_sb)) {
+			mpd->retval = err;
+			return 0;
+		}
+
 		/*
 		 * get block failure will cause us
 		 * to loop in writepages. Because
@@ -1877,8 +1904,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 		printk(KERN_EMERG "This should not happen.!! "
 					"Data will be lost\n");
 		if (err == -ENOSPC) {
-			printk(KERN_CRIT "Total free blocks count %lld\n",
-				ext4_count_free_blocks(mpd->inode->i_sb));
+			ext4_print_free_blocks(mpd->inode);
 		}
 		/* invlaidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
@@ -2085,39 +2111,36 @@ static int __mpage_da_writepage(struct page *page,
  */
 static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
-			       get_block_t get_block)
+			       struct mpage_da_data *mpd)
 {
-	struct mpage_da_data mpd;
 	long to_write;
 	int ret;
 
-	if (!get_block)
+	if (!mpd->get_block)
 		return generic_writepages(mapping, wbc);
 
-	mpd.wbc = wbc;
-	mpd.inode = mapping->host;
-	mpd.lbh.b_size = 0;
-	mpd.lbh.b_state = 0;
-	mpd.lbh.b_blocknr = 0;
-	mpd.first_page = 0;
-	mpd.next_page = 0;
-	mpd.get_block = get_block;
-	mpd.io_done = 0;
-	mpd.pages_written = 0;
+	mpd->lbh.b_size = 0;
+	mpd->lbh.b_state = 0;
+	mpd->lbh.b_blocknr = 0;
+	mpd->first_page = 0;
+	mpd->next_page = 0;
+	mpd->io_done = 0;
+	mpd->pages_written = 0;
+	mpd->retval = 0;
 
 	to_write = wbc->nr_to_write;
 
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
 
 	/*
 	 * Handle last extent of pages
 	 */
-	if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-		if (mpage_da_map_blocks(&mpd) == 0)
-			mpage_da_submit_io(&mpd);
+	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+		if (mpage_da_map_blocks(mpd) == 0)
+			mpage_da_submit_io(mpd);
 	}
 
-	wbc->nr_to_write = to_write - mpd.pages_written;
+	wbc->nr_to_write = to_write - mpd->pages_written;
 	return ret;
 }
 
@@ -2357,6 +2380,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 {
 	handle_t *handle = NULL;
 	loff_t range_start = 0;
+	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	long to_write, pages_skipped = 0;
@@ -2390,6 +2414,9 @@ static int ext4_da_writepages(struct address_space *mapping,
 	range_start =  wbc->range_start;
 	pages_skipped = wbc->pages_skipped;
 
+	mpd.wbc = wbc;
+	mpd.inode = mapping->host;
+
 restart_loop:
 	to_write = wbc->nr_to_write;
 	while (!ret && to_write > 0) {
@@ -2413,11 +2440,17 @@ restart_loop:
 			dump_stack();
 			goto out_writepages;
 		}
-
 		to_write -= wbc->nr_to_write;
-		ret = mpage_da_writepages(mapping, wbc,
-					  ext4_da_get_block_write);
+
+		mpd.get_block = ext4_da_get_block_write;
+		ret = mpage_da_writepages(mapping, wbc, &mpd);
+
 		ext4_journal_stop(handle);
+
+		if (mpd.retval == -ENOSPC)
+			jbd2_journal_force_commit_nested(sbi->s_journal);
+
+		/* reset the retry count */
 		if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
 			 * got one extent now try with
-- 
cgit v1.2.3


From ae4d537211ff250a8c23c4f1227c4276cd2508ab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 13 Sep 2008 13:10:25 -0400
Subject: ext4: truncate block allocated on a failed ext4_write_begin

For blocksize < pagesize we need to remove blocks that got allocated in
block_write_begin() if we fail with ENOSPC for later blocks.
block_write_begin() internally does this if it allocated pages locally.
This makes sure we don't have blocks outside inode.i_size during ENOSPC.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f97b3478eb8..634f0bc7570 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1391,6 +1391,13 @@ retry:
 		unlock_page(page);
 		ext4_journal_stop(handle);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2560,6 +2567,13 @@ retry:
 		unlock_page(page);
 		ext4_journal_stop(handle);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-- 
cgit v1.2.3


From cf17fea6575cb1739552e1d0cb2b446305ee3d0c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 13 Sep 2008 13:06:18 -0400
Subject: ext4: Properly update i_disksize.

With delayed allocation we use i_data_sem to update i_disksize.  We need
to update i_disksize only if the new size specified is greater than the
current value and we need to make sure we don't race with other
i_disksize update.  With delayed allocation we will switch to the
write_begin function for non-delayed allocation if we are low on free
blocks.  This means the write_begin function for non-delayed allocation
also needs to use the same locking.

We also need to check and update i_disksize even if the new size is less
that inode.i_size because of delayed allocation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 11 +++++++++++
 fs/ext4/extents.c |  9 +++++----
 fs/ext4/inode.c   | 54 ++++++++++++++++++++++++++++++------------------------
 3 files changed, 46 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e13b9deee86..3e47b99a763 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1218,6 +1218,17 @@ do {								\
 #define EXT4_FREEBLOCKS_WATERMARK 0
 #endif
 
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	/*
+	 * XXX: replace with spinlock if seen contended -bzzz
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	if (newsize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ;
+}
 
 /*
  * Inodes and files operations
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 797f0602a68..e8758df2617 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2878,10 +2878,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
 	 * Update only when preallocation was requested beyond
 	 * the file size.
 	 */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-				new_size > i_size_read(inode)) {
-		i_size_write(inode, new_size);
-		EXT4_I(inode)->i_disksize = new_size;
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
 	}
 
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 634f0bc7570..22fcbb67cd8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1434,16 +1434,18 @@ static int ext4_ordered_write_end(struct file *file,
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
 		loff_t new_i_size;
 
 		new_i_size = pos + copied;
-		if (new_i_size > EXT4_I(inode)->i_disksize)
-			EXT4_I(inode)->i_disksize = new_i_size;
+		if (new_i_size > EXT4_I(inode)->i_disksize) {
+			ext4_update_i_disksize(inode, new_i_size);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
+		}
+
 		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
@@ -1468,8 +1470,14 @@ static int ext4_writeback_write_end(struct file *file,
 	loff_t new_i_size;
 
 	new_i_size = pos + copied;
-	if (new_i_size > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = new_i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
+		/* We need to mark inode dirty even if
+		 * new_i_size is less that inode->i_size
+		 * bu greater than i_disksize.(hint delalloc)
+		 */
+		ext4_mark_inode_dirty(handle, inode);
+	}
 
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
@@ -1494,6 +1502,7 @@ static int ext4_journalled_write_end(struct file *file,
 	int ret = 0, ret2;
 	int partial = 0;
 	unsigned from, to;
+	loff_t new_i_size;
 
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1508,11 +1517,12 @@ static int ext4_journalled_write_end(struct file *file,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
+	new_i_size = pos + copied;
+	if (new_i_size > inode->i_size)
 		i_size_write(inode, pos+copied);
 	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-	if (inode->i_size > EXT4_I(inode)->i_disksize) {
-		EXT4_I(inode)->i_disksize = inode->i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
 			ret = ret2;
@@ -2227,18 +2237,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
 		if (disksize > EXT4_I(inode)->i_disksize) {
-			/*
-			 * XXX: replace with spinlock if seen contended -bzzz
-			 */
-			down_write(&EXT4_I(inode)->i_data_sem);
-			if (disksize > EXT4_I(inode)->i_disksize)
-				EXT4_I(inode)->i_disksize = disksize;
-			up_write(&EXT4_I(inode)->i_data_sem);
-
-			if (EXT4_I(inode)->i_disksize == disksize) {
-				ret = ext4_mark_inode_dirty(handle, inode);
-				return ret;
-			}
+			ext4_update_i_disksize(inode, disksize);
+			ret = ext4_mark_inode_dirty(handle, inode);
+			return ret;
 		}
 		ret = 0;
 	}
@@ -2654,6 +2655,11 @@ static int ext4_da_write_end(struct file *file,
 				EXT4_I(inode)->i_disksize = new_i_size;
 			}
 			up_write(&EXT4_I(inode)->i_data_sem);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
-- 
cgit v1.2.3


From 9d9f177572d9e4eba0f2e18523b44f90dd51fe74 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 9 Oct 2008 11:15:52 -0400
Subject: ext4: Avoid printk floods in the face of directory corruption

Note: some people thinks this represents a security bug, since it
might make the system go away while it is printing a large number of
console messages, especially if a serial console is involved.  Hence,
it has been assigned CVE-2008-3528, but it requires that the attacker
either has physical access to your machine to insert a USB disk with a
corrupted filesystem image (at which point why not just hit the power
button), or is otherwise able to convince the system administrator to
mount an arbitrary filesystem image (at which point why not just
include a setuid shell or world-writable hard disk device file or some
such).  Me, I think they're just being silly. --tytso

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Cc: Eugene Teo <eugeneteo@kernel.sg>
---
 fs/ext4/dir.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d40da316921..3ca6a2b7632 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file *filp,
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file *filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext4_error(sb, "ext4_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext4_error(sb, __func__, "directory #%lu "
+					   "contains a hole at offset %Lu",
+					   inode->i_ino,
+					   (unsigned long long) filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
-- 
cgit v1.2.3


From c62a11fd9555007b1caab83b5bcbb443a43e32bb Mon Sep 17 00:00:00 2001
From: Frederic Bohe <frederic.bohe@bull.net>
Date: Mon, 8 Sep 2008 10:20:24 -0400
Subject: Update flex_bg free blocks and free inodes counters when resizing.

This fixes a bug which prevented the newly created inodes after a
resize from being used on filesystems with flex_bg.

Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 9 +++++++++
 fs/ext4/super.c  | 7 +++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4392e3fd0f0..b60afbcd7e4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -929,6 +929,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, input->group);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			input->free_blocks_count;
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			EXT4_INODES_PER_GROUP(sb);
+	}
+
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
 	sb->s_dirt = 1;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index efa40d9d379..f58cc0309dc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1507,8 +1507,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
-	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
-		groups_per_flex;
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
+			   groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-- 
cgit v1.2.3


From 899fc1a4cf404747de2666534d508804597ee22f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 14 Sep 2008 10:21:33 -0400
Subject: ext4: fix #11321: create /proc/ext4/*/stats more carefully

ext4 creates per-suberblock directory in /proc/ext4/ . Name used as
basis is taken from bdevname, which, surprise, can contain slash.

However, proc while allowing to use proc_create("a/b", parent) form of
PDE creation, assumes that parent/a was already created.

bdevname in question is 'cciss/c0d0p9', directory is not created and all
this stuff goes directly into /proc (which is real bug).

Warning comes when _second_ partition is mounted.

http://bugzilla.kernel.org/show_bug.cgi?id=11321

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 585c2595018..9122271e3d6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2786,14 +2786,20 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct proc_dir_entry *proc;
-	char devname[64];
+	char devname[BDEVNAME_SIZE], *p;
 
 	if (proc_root_ext4 == NULL) {
 		sbi->s_mb_proc = NULL;
 		return -EINVAL;
 	}
 	bdevname(sb->s_bdev, devname);
+	p = devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+
 	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
+	if (!sbi->s_mb_proc)
+		goto err_create_dir;
 
 	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
 	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
@@ -2805,7 +2811,6 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	return 0;
 
 err_out:
-	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
 	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
 	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
 	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
@@ -2814,6 +2819,8 @@ err_out:
 	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
 	remove_proc_entry(devname, proc_root_ext4);
 	sbi->s_mb_proc = NULL;
+err_create_dir:
+	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
 
 	return -ENOMEM;
 }
@@ -2821,12 +2828,15 @@ err_out:
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	char devname[64];
+	char devname[BDEVNAME_SIZE], *p;
 
 	if (sbi->s_mb_proc == NULL)
 		return -EINVAL;
 
 	bdevname(sb->s_bdev, devname);
+	p = devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
 	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
 	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
 	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-- 
cgit v1.2.3


From 05496769e5da83ce22ed97345afd9c7b71d6bd24 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 16 Sep 2008 14:36:17 -0400
Subject: jbd2: clean up how the journal device name is printed

Calculate the journal device name once and stash it away in the
journal_s structure.  This avoids needing to call bdevname()
everywhere and reduces stack usage by not needing to allocate an
on-stack buffer.  In addition, we eliminate the '/' that can appear in
device names (e.g. "cciss/c0d0p9" --- see kernel bugzilla #11321) that
can cause problems when creating proc directory names, and include the
inode number to support ocfs2 which creates multiple journals with
different inode numbers.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c   | 12 +++---------
 fs/jbd2/commit.c  | 11 +++--------
 fs/jbd2/journal.c | 48 ++++++++++++++++--------------------------------
 3 files changed, 22 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f58cc0309dc..64e1c21eb5d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1476,15 +1476,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk(KERN_INFO "EXT4 FS on %s, external journal on %s\n",
-		       sb->s_id, bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk(KERN_INFO "EXT4 FS on %s, internal journal\n",
-		       sb->s_id);
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95e..b091e5378fe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
 	 * to remember if we sent a barrier request
 	 */
 	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
+		       "JBD: barrier-based sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
 		spin_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
 		spin_unlock(&journal->j_state_lock);
@@ -681,11 +678,9 @@ start_journal_io:
 	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
 	if (err) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors while flushing file data "
-			"on %s\n", bdevname(journal->j_fs_dev, b));
+		       "on %s\n", journal->j_devname);
 		err = 0;
 	}
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4ed..81186a29742 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 		if (ret)
 			*retp = ret;
 		else {
-			char b[BDEVNAME_SIZE];
-
 			printk(KERN_ALERT "%s: journal block not found "
 					"at offset %lu on %s\n",
-				__func__,
-				blocknr,
-				bdevname(journal->j_dev, b));
+			       __func__, blocknr, journal->j_devname);
 			err = -EIO;
 			__journal_abort_soft(journal, err);
 		}
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
 
 static void jbd2_stats_proc_init(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
-	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
 	if (journal->j_proc_entry) {
 		proc_create_data("history", S_IRUGO, journal->j_proc_entry,
 				 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
 
 static void jbd2_stats_proc_exit(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
 	remove_proc_entry("info", journal->j_proc_entry);
 	remove_proc_entry("history", journal->j_proc_entry);
-	remove_proc_entry(name, proc_jbd2_stats);
+	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
 static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
+	char *p;
 	int n;
 
 	if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
 	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 {
 	struct buffer_head *bh;
 	journal_t *journal = journal_init_common();
+	char *p;
 	int err;
 	int n;
 	unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	p = journal->j_devname + strlen(journal->j_devname);
+	sprintf(p, ":%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino,
@@ -1760,23 +1762,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	return err;
 }
 
-/*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
-	struct block_device *bdev;
-
-	if (journal->j_inode)
-		bdev = journal->j_inode->i_sb->s_bdev;
-	else
-		bdev = journal->j_dev;
-
-	return bdevname(bdev, buffer);
-}
-
 /*
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
@@ -1793,13 +1778,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
 void __jbd2_journal_abort_hard(journal_t *journal)
 {
 	transaction_t *transaction;
-	char b[BDEVNAME_SIZE];
 
 	if (journal->j_flags & JBD2_ABORT)
 		return;
 
 	printk(KERN_ERR "Aborting journal on device %s.\n",
-		journal_dev_name(journal, b));
+	       journal->j_devname);
 
 	spin_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
-- 
cgit v1.2.3


From 7ee1ec4ca30c6df8e989615cdaacb75f2af4fa6b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 8 Sep 2008 10:47:19 -0400
Subject: ext4: add missing unlock in ext4_check_descriptors() on error path

If there group descriptors are corrupted we need unlock the block
group lock before returning from the function; else we will oops when
freeing a spinlock which is still being held.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64e1c21eb5d..8175318abd8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1623,8 +1623,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			       "Checksum for group %lu failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
-			if (!(sb->s_flags & MS_RDONLY))
+			if (!(sb->s_flags & MS_RDONLY)) {
+				spin_unlock(sb_bgl_lock(sbi, i));
 				return 0;
+			}
 		}
 		spin_unlock(sb_bgl_lock(sbi, i));
 		if (!flexbg_flag)
-- 
cgit v1.2.3


From 5c89468c12899b84886cb47eec93f0c88e0f896a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Sep 2008 19:44:17 +0200
Subject: udf: add llseek method

UDF currently doesn't set a llseek method for regular files, which
means it will fall back to default_llseek.  This means no one can seek
beyond 2 Gigabytes on udf, and that there's not protection vs
the i_size updates from writers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d..eb91f3b7032 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
 	.release		= udf_release_file,
 	.fsync			= udf_fsync_file,
 	.splice_read		= generic_file_splice_read,
+	.llseek			= generic_file_llseek,
 };
 
 const struct inode_operations udf_file_inode_operations = {
-- 
cgit v1.2.3


From af904deaf6da3f3285eb0a06a3dc6a1af0251030 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 8 Sep 2008 11:58:13 -0400
Subject: NFS: Restore missing hunk in NFS mount option parser

Automounter maps can contain mount options valid for other NFS
implementations but not for Linux.  The Linux automounter uses the
mount command's "-s" command line option ("s" for "sloppy") so that
mount requests containing such options are not rejected.

Commit f45663ce5fb30f76a3414ab3ac69f4dd320e760a attempted to address a
known regression with text-based NFS mount option parsing.  Unrecognized
mount options would cause mount requests to fail, even if the "-s"
option was used on the mount command line.

Unfortunately, this commit was not complete as submitted.  It adds a
new mount option, "sloppy".  But it is missing a hunk, so it now allows
NFS mounts with unrecognized mount options, even if the "sloppy" option
is not present.  This could be a problem if a required critical mount
option such as "sync" is misspelled, for example, and is considered a
regression from 2.6.26.

This patch restores the missing hunk.  Now, the default behavior of
text-based NFS mount options is as before: any unrecognized mount option
will cause the mount to fail.

Please include this in 2.6.27-rc.

Thanks to Neil Brown for reporting this.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f..e9b20173fef 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw,
 		}
 	}
 
+	if (errors > 0) {
+		dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
+				errors, (errors == 1 ? "" : "s"));
+		if (!sloppy)
+			return 0;
+	}
 	return 1;
 
 out_nomem:
-- 
cgit v1.2.3


From 0e116227a01580acf47437adba3afadf21b6bd5f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 3 Sep 2008 01:57:14 +0800
Subject: ocfs2: Fix a bug in direct IO read.

ocfs2 will become read-only if we try to read the bytes which pass
the end of i_size. This can be easily reproduced by following steps:
1. mkfs a ocfs2 volume with bs=4k cs=4k and nosparse.
2. create a small file(say less than 100 bytes) and we will create the file
   which is allocated 1 cluster.
3. read 8196 bytes from the kernel using O_DIRECT which exceeds the limit.
4. The ocfs2 volume becomes read-only and dmesg shows:
OCFS2: ERROR (device sda13): ocfs2_direct_IO_get_blocks:
Inode 66010 has a hole at block 1
File system is now read-only due to the potential of on-disk corruption.
Please run fsck.ocfs2 once the file system is unmounted.

So suppress the ERROR message.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb507..a53da146627 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has a hole at block %llu\n",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-- 
cgit v1.2.3


From 2a43a878001cc5cb7c3c7be2e8dad0a1aeb939b0 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 13 Sep 2008 12:52:26 -0400
Subject: ext4: elevate write count for migrate ioctl

The migrate ioctl writes to the filsystem, so we need to elevate the
write count.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  3 +--
 fs/ext4/ioctl.c   | 21 ++++++++++++++++++++-
 fs/ext4/migrate.c | 10 +---------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3e47b99a763..9b6ad8f19bc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1084,8 +1084,7 @@ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
-extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
-		       unsigned long);
+extern int ext4_ext_migrate(struct inode *);
 /* namei.c */
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ca09dd1039e..9f3044ac699 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -267,7 +267,26 @@ setversion_out:
 	}
 
 	case EXT4_IOC_MIGRATE:
-		return ext4_ext_migrate(inode, filp, cmd, arg);
+	{
+		int err;
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		/*
+		 * inode_mutex prevent write and truncate on the file.
+		 * Read still goes through. We take i_data_sem in
+		 * ext4_ext_swap_inode_data before we switch the
+		 * inode format to prevent read.
+		 */
+		mutex_lock(&(inode->i_mutex));
+		err = ext4_ext_migrate(inode);
+		mutex_unlock(&(inode->i_mutex));
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
+	}
 
 	default:
 		return -ENOTTY;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46fc0b5b12b..f2a9cf498ec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -447,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 
 }
 
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
-				unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
 {
 	handle_t *handle;
 	int retval = 0, i;
@@ -515,12 +514,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	 * trascation that created the inode. Later as and
 	 * when we add extents we extent the journal
 	 */
-	/*
-	 * inode_mutex prevent write and truncate on the file. Read still goes
-	 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
-	 * switch the inode format to prevent read.
-	 */
-	mutex_lock(&(inode->i_mutex));
 	/*
 	 * Even though we take i_mutex we can still cause block allocation
 	 * via mmap write to holes. If we have allocated new blocks we fail
@@ -623,7 +616,6 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-	mutex_unlock(&(inode->i_mutex));
 
 	if (tmp_inode)
 		iput(tmp_inode);
-- 
cgit v1.2.3


From 4db46fc266b84a04b73b7a4c6ebe8a543a62a2ff Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 8 Oct 2008 23:34:06 -0400
Subject: ext4: hook the ext3 migration interface to the EXT4_IOC_SETFLAGS
 ioctl

This patch hooks the ext3 to ext4 migrate interface to
EXT4_IOC_SETFLAGS ioctl. The userspace interface is via chattr +e.  We
only allow setting extent flags.  Clearing extent flag (migrating from
ext4 to ext3) is not supported.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/ioctl.c | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9b6ad8f19bc..068adc1d560 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -245,7 +245,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
 /*
  * Inode dynamic state flags
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9f3044ac699..3e14060b398 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -34,7 +34,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
 		handle_t *handle = NULL;
-		int err;
+		int err, migrate = 0;
 		struct ext4_iloc iloc;
 		unsigned int oldflags;
 		unsigned int jflag;
@@ -82,6 +82,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
+		if (oldflags & EXT4_EXTENTS_FL) {
+			/* We don't support clearning extent flags */
+			if (!(flags & EXT4_EXTENTS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (flags & EXT4_EXTENTS_FL) {
+			/* migrate the file */
+			migrate = 1;
+			flags &= ~EXT4_EXTENTS_FL;
+		}
 
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
@@ -109,6 +120,10 @@ flags_err:
 
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
 			err = ext4_change_inode_journal_flag(inode, jflag);
+		if (err)
+			goto flags_out;
+		if (migrate)
+			err = ext4_ext_migrate(inode);
 flags_out:
 		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
-- 
cgit v1.2.3


From 8eea80d52b9d87cfd771055534bd2c24f73704d7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 13 Sep 2008 19:54:35 -0400
Subject: ext4: Renumber EXT4_IOC_MIGRATE

Pick an ioctl number for EXT4_IOC_MIGRATE that won't conflict with
other ext4 ioctl's.  Since there haven't been any major userspace
users of this ioctl, we can afford to change this now, to avoid
potential problems later.

Also, reorder the ioctl numbers in ext4.h to avoid this sort of
mistake in the future.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 068adc1d560..50a4846c7e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
 #define	EXT4_IOC_SETFLAGS		FS_IOC_SETFLAGS
 #define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
 #define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
-#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
 #define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
 #define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
 #endif
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
-#define EXT4_IOC_MIGRATE		_IO('f', 7)
+#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 
 /*
  * ioctl commands in 32 bit emulation
-- 
cgit v1.2.3


From 730c213c79a638137b47a90624e4bac252f07ae7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 13 Sep 2008 15:23:29 -0400
Subject: ext4: use percpu data structures for lg_prealloc_list

lg_prealloc_list seems to cry out for a per-cpu data structure; on a large
smp system I think this should be better.  I've lightly tested this change
on a 4-cpu system.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9122271e3d6..14ebd572bea 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2541,17 +2541,16 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
 	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
 
-	i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
-	sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
 		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
-	for (i = 0; i < nr_cpu_ids; i++) {
+	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
-		lg = &sbi->s_locality_groups[i];
+		lg = per_cpu_ptr(sbi->s_locality_groups, i);
 		mutex_init(&lg->lg_mutex);
 		for (j = 0; j < PREALLOC_TB_SIZE; j++)
 			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
@@ -2648,8 +2647,7 @@ int ext4_mb_release(struct super_block *sb)
 				atomic_read(&sbi->s_mb_discarded));
 	}
 
-	kfree(sbi->s_locality_groups);
-
+	free_percpu(sbi->s_locality_groups);
 	ext4_mb_history_release(sb);
 	ext4_mb_destroy_per_dev_proc(sb);
 
@@ -4106,8 +4104,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	 * per cpu locality group is to reduce the contention between block
 	 * request from multiple CPUs.
 	 */
-	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
-	put_cpu();
+	ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
 
 	/* we're going to use group allocation */
 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
-- 
cgit v1.2.3


From 665020c35e89a9e0643e21561e4f8f967f4f2c4b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 13 Sep 2008 02:33:06 -0700
Subject: proc: more debugging for "already registered" case

Print parent directory name as well.

The aim is to catch non-creation of parent directory when proc_mkdir will
return NULL and all subsequent registrations go directly in /proc instead
of intended directory.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Fixed insane printk string while at it.  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bca0f81eb68..7821589a17d 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -547,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 
 	for (tmp = dir->subdir; tmp; tmp = tmp->next)
 		if (strcmp(tmp->name, dp->name) == 0) {
-			printk(KERN_WARNING "proc_dir_entry '%s' already "
-					"registered\n", dp->name);
+			printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+				dir->name, dp->name);
 			dump_stack();
 			break;
 		}
-- 
cgit v1.2.3


From 1558182f651798164418abf53f76786da0ea4a6f Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sat, 13 Sep 2008 02:33:12 -0700
Subject: bfs: fix Lockdep warning

This fixes:

  =============================================
  [ INFO: possible recursive locking detected ]
  2.6.27-rc5-00283-g70bb089 #68
  ---------------------------------------------
  touch/6855 is trying to acquire lock:
   (&info->bfs_lock){--..}, at: [<c02262f5>] bfs_delete_inode+0x9e/0x18c

  but task is already holding lock:
   (&info->bfs_lock){--..}, at: [<c0226c00>] bfs_create+0x45/0x187

  other info that might help us debug this:
  2 locks held by touch/6855:
   #0:  (&type->i_mutex_dir_key#5){--..}, at: [<c018ad13>] do_filp_open+0x10b/0x62f
   #1:  (&info->bfs_lock){--..}, at: [<c0226c00>] bfs_create+0x45/0x187

  stack backtrace:
  Pid: 6855, comm: touch Not tainted 2.6.27-rc5-00283-g70bb089 #68
   [<c013e769>] validate_chain+0x458/0x9f4
   [<c013bece>] ? trace_hardirqs_off+0xb/0xd
   [<c013f36b>] __lock_acquire+0x666/0x6e0
   [<c013f440>] lock_acquire+0x5b/0x77
   [<c02262f5>] ? bfs_delete_inode+0x9e/0x18c
   [<c06aab74>] mutex_lock_nested+0xbc/0x234
   [<c02262f5>] ? bfs_delete_inode+0x9e/0x18c
   [<c02262f5>] ? bfs_delete_inode+0x9e/0x18c
   [<c02262f5>] bfs_delete_inode+0x9e/0x18c
   [<c0226257>] ? bfs_delete_inode+0x0/0x18c
   [<c01925e1>] generic_delete_inode+0x94/0xfe
   [<c019265d>] generic_drop_inode+0x12/0x12f
   [<c0191b7e>] iput+0x4b/0x4e
   [<c0226d1e>] bfs_create+0x163/0x187
   [<c0188b42>] vfs_create+0xa6/0x114
   [<c018adb5>] do_filp_open+0x1ad/0x62f
   [<c0107cdc>] ? native_sched_clock+0x82/0x96
   [<c06ac309>] ? _spin_unlock+0x27/0x3c
   [<c019379e>] ? alloc_fd+0xbf/0xc9
   [<c06ae2f4>] ? sub_preempt_count+0x9d/0xab
   [<c019379e>] ? alloc_fd+0xbf/0xc9
   [<c0180391>] do_sys_open+0x42/0xb8
   [<c041d564>] ? trace_hardirqs_on_thunk+0xc/0x10
   [<c0180449>] sys_open+0x1e/0x26
   [<c01038bd>] sysenter_do_call+0x12/0x31
   =======================

The problem is that we don't unlock the bfs->lock mutex before calling
iput (we do in the other cases).

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee34..ed8feb052df 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 							inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
-		iput(inode);
 		mutex_unlock(&info->bfs_lock);
+		iput(inode);
 		return err;
 	}
 	mutex_unlock(&info->bfs_lock);
-- 
cgit v1.2.3


From d7a3e4959c28bccc25dd33315809ffcf40f7493e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 13 Sep 2008 02:33:13 -0700
Subject: mm: ifdef Quicklists in /proc/meminfo

A "Quicklists:          0 kB" line has just started appearing in
/proc/meminfo, but most architectures (including x86) don't have
them configured, so #ifdef it, like the highmem lines.

And those architectures which do have quicklists configured are
using them for page tables: so let's place it next to PageTables.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 00f10a2dcf1..29e20c6b1f7 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -183,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
 		"PageTables:   %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+		"Quicklists:   %8lu kB\n"
+#endif
 		"NFS_Unstable: %8lu kB\n"
 		"Bounce:       %8lu kB\n"
 		"WritebackTmp: %8lu kB\n"
@@ -190,8 +193,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Committed_AS: %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
 		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n"
-		"Quicklists:   %8lu kB\n",
+		"VmallocChunk: %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
@@ -216,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
 		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+		K(quicklist_total_size()),
+#endif
 		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(global_page_state(NR_BOUNCE)),
 		K(global_page_state(NR_WRITEBACK_TEMP)),
@@ -223,8 +228,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
-		vmi.largest_chunk >> 10,
-		K(quicklist_total_size())
+		vmi.largest_chunk >> 10
 		);
 
 		len += hugetlb_report_meminfo(page + len);
-- 
cgit v1.2.3


From 8d99f83b9478768d3a8d7d1bcd9bd182c75a0447 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 13 Sep 2008 02:33:25 -0700
Subject: rescan_partitions(): make device capacity errors non-fatal

Herton Krzesinski reports that the error-checking changes in
04ebd4aee52b06a2c38127d9208546e5b96f3a19 ("block/ioctl.c and
fs/partition/check.c: check value returned by add_partition") cause his
buggy USB camera to no longer mount.  "The camera is an Olympus X-840.
The original issue comes from the camera itself: its format program
creates a partition with an off by one error".

Buggy devices happen.  It is better for the kernel to warn and to proceed
with the mount.

Reported-by: Herton Ronaldo Krzesinski <herton@mandriva.com.br>
Cc: Abdel Benamrouche <draconux@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/check.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201d..ecc3330972e 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -499,9 +499,9 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		if (!size)
 			continue;
 		if (from + size > get_capacity(disk)) {
-			printk(KERN_ERR " %s: p%d exceeds device capacity\n",
+			printk(KERN_WARNING
+				"%s: p%d exceeds device capacity\n",
 				disk->disk_name, p);
-			continue;
 		}
 		res = add_partition(disk, p, from, size, state->parts[p].flags);
 		if (res) {
-- 
cgit v1.2.3


From acb57a3652c614efed26080dad5972c0076166b1 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 11 Sep 2008 15:35:37 -0400
Subject: GFS2: Direct IO write at end of file error

This patch fixes a problem whereby a direct_io write doesn't fall
back to buffered write properly at end of file.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117..ae7126aeb44 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	if (offset > i_size_read(&ip->i_inode))
+	if (offset >= i_size_read(&ip->i_inode))
 		return 0;
 	return 1;
 }
-- 
cgit v1.2.3


From acd2c8aa02f302ed838348052e16ee575c645147 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Mon, 15 Sep 2008 08:54:06 -0500
Subject: GFS2: GFS2 will panic if you misspell any mount options

The gfs2 superblock pointer is NULL after a failed mount. When control
eventually goes to gfs2_kill_sb, we dereference this NULL pointer. This
patch ensures that the gfs2 superblock pointer is not NULL before being
dereferenced in gfs2_kill_sb.

Signed-off-by:   Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a6225cce2cb..ae35f097aa6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1237,14 +1237,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	gfs2_meta_syncfs(sdp);
-	dput(sdp->sd_root_dir);
-	dput(sdp->sd_master_dir);
-	sdp->sd_root_dir = NULL;
-	sdp->sd_master_dir = NULL;
+	if (sdp) {
+		gfs2_meta_syncfs(sdp);
+		dput(sdp->sd_root_dir);
+		dput(sdp->sd_master_dir);
+		sdp->sd_root_dir = NULL;
+		sdp->sd_master_dir = NULL;
+	}
 	shrink_dcache_sb(sb);
 	kill_block_super(sb);
-	gfs2_delete_debugfs_file(sdp);
+	if (sdp)
+		gfs2_delete_debugfs_file(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
-- 
cgit v1.2.3


From 5f0319a79043457d2555f059fac68c1d840ce381 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 16 Sep 2008 14:05:16 -0400
Subject: cifs: clean up variables in cifs_unlink

Change parameters to cifs_unlink to match the ones used in the generic
VFS. Add some local variables to cut down on the amount of struct
dereferencing that needs to be done, and eliminate some unneeded NULL
pointer checks on the parent directory inode. Finally, rename pTcon
to "tcon" to more closely match standard kernel coding style.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h |  2 +-
 fs/cifs/inode.c  | 90 ++++++++++++++++++++++++++------------------------------
 2 files changed, 42 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 135c965c413..f7b4a5cd837 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
 		       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 				  struct nameidata *);
-extern int cifs_unlink(struct inode *, struct dentry *);
+extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
 extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
 extern int cifs_mkdir(struct inode *, struct dentry *, int);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c548f11010..511c5261679 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -665,40 +665,34 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
-int cifs_unlink(struct inode *inode, struct dentry *direntry)
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
 	struct cifsInodeInfo *cifsInode;
+	struct super_block *sb = dir->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
 	FILE_BASIC_INFO *pinfo_buf;
 
-	cFYI(1, ("cifs_unlink, inode = 0x%p", inode));
+	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
 	xid = GetXid();
 
-	if (inode)
-		cifs_sb = CIFS_SB(inode->i_sb);
-	else
-		cifs_sb = CIFS_SB(direntry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	/* Unlink can be called from rename so we can not grab the sem here
-	   since we deadlock otherwise */
-/*	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
-	full_path = build_path_from_dentry(direntry);
-/*	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
+	/* Unlink can be called from rename so we can not take the
+	 * sb->s_vfs_rename_mutex here */
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
 	}
 
-	if ((pTcon->ses->capabilities & CAP_UNIX) &&
+	if ((tcon->ses->capabilities & CAP_UNIX) &&
 		(CIFS_UNIX_POSIX_PATH_OPS_CAP &
-			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
-		rc = CIFSPOSIXDelFile(xid, pTcon, full_path,
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 		cFYI(1, ("posix del rc %d", rc));
@@ -706,31 +700,31 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 			goto psx_del_no_retry;
 	}
 
-	rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls,
+	rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 psx_del_no_retry:
 	if (!rc) {
-		if (direntry->d_inode)
-			drop_nlink(direntry->d_inode);
+		if (inode)
+			drop_nlink(inode);
 	} else if (rc == -ENOENT) {
-		d_drop(direntry);
+		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
 		int oplock = 0;
 		__u16 netfid;
 
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE,
+		rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, DELETE,
 				 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
 				 &netfid, &oplock, NULL, cifs_sb->local_nls,
 				 cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (rc == 0) {
-			CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
+			CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL,
 					      cifs_sb->local_nls,
 					      cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-			if (direntry->d_inode)
-				drop_nlink(direntry->d_inode);
+			CIFSSMBClose(xid, tcon, netfid);
+			if (inode)
+				drop_nlink(inode);
 		}
 	} else if (rc == -EACCES) {
 		/* try only if r/o attribute set in local lookup data? */
@@ -738,8 +732,8 @@ psx_del_no_retry:
 		if (pinfo_buf) {
 			/* ATTRS set to normal clears r/o bit */
 			pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
-			if (!(pTcon->ses->flags & CIFS_SES_NT4))
-				rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+			if (!(tcon->ses->flags & CIFS_SES_NT4))
+				rc = CIFSSMBSetPathInfo(xid, tcon, full_path,
 						     pinfo_buf,
 						     cifs_sb->local_nls,
 						     cifs_sb->mnt_cifs_flags &
@@ -750,7 +744,7 @@ psx_del_no_retry:
 			if (rc == -EOPNOTSUPP) {
 				int oplock = 0;
 				__u16 netfid;
-			/*	rc = CIFSSMBSetAttrLegacy(xid, pTcon,
+			/*	rc = CIFSSMBSetAttrLegacy(xid, tcon,
 							  full_path,
 							  (__u16)ATTR_NORMAL,
 							  cifs_sb->local_nls);
@@ -761,7 +755,7 @@ psx_del_no_retry:
 
 			/* BB could scan to see if we already have it open
 			   and pass in pid of opener to function */
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
+				rc = CIFSSMBOpen(xid, tcon, full_path,
 						 FILE_OPEN, SYNCHRONIZE |
 						 FILE_WRITE_ATTRIBUTES, 0,
 						 &netfid, &oplock, NULL,
@@ -769,28 +763,28 @@ psx_del_no_retry:
 						 cifs_sb->mnt_cifs_flags &
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
 				if (rc == 0) {
-					rc = CIFSSMBSetFileInfo(xid, pTcon,
+					rc = CIFSSMBSetFileInfo(xid, tcon,
 								pinfo_buf,
 								netfid,
 								current->tgid);
-					CIFSSMBClose(xid, pTcon, netfid);
+					CIFSSMBClose(xid, tcon, netfid);
 				}
 			}
 			kfree(pinfo_buf);
 		}
 		if (rc == 0) {
-			rc = CIFSSMBDelFile(xid, pTcon, full_path,
+			rc = CIFSSMBDelFile(xid, tcon, full_path,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (!rc) {
-				if (direntry->d_inode)
-					drop_nlink(direntry->d_inode);
+				if (inode)
+					drop_nlink(inode);
 			} else if (rc == -ETXTBSY) {
 				int oplock = 0;
 				__u16 netfid;
 
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
+				rc = CIFSSMBOpen(xid, tcon, full_path,
 						 FILE_OPEN, DELETE,
 						 CREATE_NOT_DIR |
 						 CREATE_DELETE_ON_CLOSE,
@@ -799,30 +793,28 @@ psx_del_no_retry:
 						 cifs_sb->mnt_cifs_flags &
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
 				if (rc == 0) {
-					CIFSSMBRenameOpenFile(xid, pTcon,
+					CIFSSMBRenameOpenFile(xid, tcon,
 						netfid, NULL,
 						cifs_sb->local_nls,
 						cifs_sb->mnt_cifs_flags &
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-					CIFSSMBClose(xid, pTcon, netfid);
-					if (direntry->d_inode)
-						drop_nlink(direntry->d_inode);
+					CIFSSMBClose(xid, tcon, netfid);
+					if (inode)
+						drop_nlink(inode);
 				}
 			/* BB if rc = -ETXTBUSY goto the rename logic BB */
 			}
 		}
 	}
-	if (direntry->d_inode) {
-		cifsInode = CIFS_I(direntry->d_inode);
-		cifsInode->time = 0;	/* will force revalidate to get info
-					   when needed */
-		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
-	}
 	if (inode) {
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
-		cifsInode->time = 0;	/* force revalidate of dir as well */
+		cifsInode->time = 0;	/* will force revalidate to get info
+					   when needed */
+		inode->i_ctime = current_fs_time(sb);
 	}
+	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+	cifsInode = CIFS_I(dir);
+	cifsInode->time = 0;	/* force revalidate of dir as well */
 
 	kfree(full_path);
 	FreeXid(xid);
-- 
cgit v1.2.3


From 388e57b2759672a3e3ede0d2f7e95124b417b0a3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 16 Sep 2008 23:50:58 +0000
Subject: [CIFS] use common code for turning off ATTR_READONLY in cifs_unlink

We already have a cifs_set_file_info function that can flip DOS
attribute bits. Have cifs_unlink call it to handle turning ATTR_HIDDEN
on and ATTR_READONLY off when an unlink attempt returns -EACCES.

This also removes a level of indentation from cifs_unlink.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 306 +++++++++++++++++++++++++-------------------------------
 1 file changed, 139 insertions(+), 167 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 511c5261679..8dbc7c90309 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -665,6 +665,101 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
+static int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
+		    char *full_path, __u32 dosattr)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid;
+	__u32 netpid;
+	bool set_time = false;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	FILE_BASIC_INFO	info_buf;
+
+	if (attrs->ia_valid & ATTR_ATIME) {
+		set_time = true;
+		info_buf.LastAccessTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+	} else
+		info_buf.LastAccessTime = 0;
+
+	if (attrs->ia_valid & ATTR_MTIME) {
+		set_time = true;
+		info_buf.LastWriteTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+	} else
+		info_buf.LastWriteTime = 0;
+
+	/*
+	 * Samba throws this field away, but windows may actually use it.
+	 * Do not set ctime unless other time stamps are changed explicitly
+	 * (i.e. by utimes()) since we would then have a mix of client and
+	 * server times.
+	 */
+	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+		cFYI(1, ("CIFS - CTIME changed"));
+		info_buf.ChangeTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+	} else
+		info_buf.ChangeTime = 0;
+
+	info_buf.CreationTime = 0;	/* don't change */
+	info_buf.Attributes = cpu_to_le32(dosattr);
+
+	/*
+	 * If the file is already open for write, just use that fileid
+	 */
+	open_file = find_writable_file(cifsInode);
+	if (open_file) {
+		netfid = open_file->netfid;
+		netpid = open_file->pid;
+		goto set_via_filehandle;
+	}
+
+	/*
+	 * NT4 apparently returns success on this call, but it doesn't
+	 * really work.
+	 */
+	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
+		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+				     &info_buf, cifs_sb->local_nls,
+				     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc != -EOPNOTSUPP && rc != -EINVAL)
+			goto out;
+	}
+
+	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server"));
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
+			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR, &netfid, &oplock,
+			 NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc != 0) {
+		if (rc == -EIO)
+			rc = -EINVAL;
+		goto out;
+	}
+
+	netpid = current->tgid;
+
+set_via_filehandle:
+	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+	if (open_file == NULL)
+		CIFSSMBClose(xid, pTcon, netfid);
+	else
+		atomic_dec(&open_file->wrtPending);
+out:
+	return rc;
+}
+
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
@@ -675,7 +770,8 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	FILE_BASIC_INFO *pinfo_buf;
+	struct iattr *attrs;
+	__u32 dosattr;
 
 	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
@@ -728,84 +824,55 @@ psx_del_no_retry:
 		}
 	} else if (rc == -EACCES) {
 		/* try only if r/o attribute set in local lookup data? */
-		pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
-		if (pinfo_buf) {
-			/* ATTRS set to normal clears r/o bit */
-			pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
-			if (!(tcon->ses->flags & CIFS_SES_NT4))
-				rc = CIFSSMBSetPathInfo(xid, tcon, full_path,
-						     pinfo_buf,
-						     cifs_sb->local_nls,
-						     cifs_sb->mnt_cifs_flags &
-							CIFS_MOUNT_MAP_SPECIAL_CHR);
-			else
-				rc = -EOPNOTSUPP;
-
-			if (rc == -EOPNOTSUPP) {
-				int oplock = 0;
-				__u16 netfid;
-			/*	rc = CIFSSMBSetAttrLegacy(xid, tcon,
-							  full_path,
-							  (__u16)ATTR_NORMAL,
-							  cifs_sb->local_nls);
-			   For some strange reason it seems that NT4 eats the
-			   old setattr call without actually setting the
-			   attributes so on to the third attempted workaround
-			   */
-
-			/* BB could scan to see if we already have it open
-			   and pass in pid of opener to function */
-				rc = CIFSSMBOpen(xid, tcon, full_path,
-						 FILE_OPEN, SYNCHRONIZE |
-						 FILE_WRITE_ATTRIBUTES, 0,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					rc = CIFSSMBSetFileInfo(xid, tcon,
-								pinfo_buf,
-								netfid,
-								current->tgid);
-					CIFSSMBClose(xid, tcon, netfid);
-				}
-			}
-			kfree(pinfo_buf);
+		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+		if (attrs == NULL) {
+			rc = -ENOMEM;
+			goto out_reval;
 		}
+
+		/* try to reset dos attributes */
+		cifsInode = CIFS_I(inode);
+		dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+		if (dosattr == 0)
+			dosattr |= ATTR_NORMAL;
+		dosattr |= ATTR_HIDDEN;
+
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		kfree(attrs);
+		if (rc != 0)
+			goto out_reval;
+		rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
+				    cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (rc == 0) {
-			rc = CIFSSMBDelFile(xid, tcon, full_path,
-					    cifs_sb->local_nls,
-					    cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (!rc) {
+			if (inode)
+				drop_nlink(inode);
+		} else if (rc == -ETXTBSY) {
+			int oplock = 0;
+			__u16 netfid;
+
+			rc = CIFSSMBOpen(xid, tcon, full_path,
+					 FILE_OPEN, DELETE,
+					 CREATE_NOT_DIR |
+					 CREATE_DELETE_ON_CLOSE,
+					 &netfid, &oplock, NULL,
+					 cifs_sb->local_nls,
+					 cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (rc == 0) {
+				CIFSSMBRenameOpenFile(xid, tcon,
+					netfid, NULL,
+					cifs_sb->local_nls,
+					cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+				CIFSSMBClose(xid, tcon, netfid);
 				if (inode)
 					drop_nlink(inode);
-			} else if (rc == -ETXTBSY) {
-				int oplock = 0;
-				__u16 netfid;
-
-				rc = CIFSSMBOpen(xid, tcon, full_path,
-						 FILE_OPEN, DELETE,
-						 CREATE_NOT_DIR |
-						 CREATE_DELETE_ON_CLOSE,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					CIFSSMBRenameOpenFile(xid, tcon,
-						netfid, NULL,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-					CIFSSMBClose(xid, tcon, netfid);
-					if (inode)
-						drop_nlink(inode);
-				}
-			/* BB if rc = -ETXTBUSY goto the rename logic BB */
 			}
+		/* BB if rc = -ETXTBUSY goto the rename logic BB */
 		}
 	}
+out_reval:
 	if (inode) {
 		cifsInode = CIFS_I(inode);
 		cifsInode->time = 0;	/* will force revalidate to get info
@@ -1498,101 +1565,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	return rc;
 }
 
-static int
-cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
-		    char *full_path, __u32 dosattr)
-{
-	int rc;
-	int oplock = 0;
-	__u16 netfid;
-	__u32 netpid;
-	bool set_time = false;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
-	FILE_BASIC_INFO	info_buf;
-
-	if (attrs->ia_valid & ATTR_ATIME) {
-		set_time = true;
-		info_buf.LastAccessTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
-	} else
-		info_buf.LastAccessTime = 0;
-
-	if (attrs->ia_valid & ATTR_MTIME) {
-		set_time = true;
-		info_buf.LastWriteTime =
-		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
-	} else
-		info_buf.LastWriteTime = 0;
-
-	/*
-	 * Samba throws this field away, but windows may actually use it.
-	 * Do not set ctime unless other time stamps are changed explicitly
-	 * (i.e. by utimes()) since we would then have a mix of client and
-	 * server times.
-	 */
-	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
-		info_buf.ChangeTime =
-		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
-	} else
-		info_buf.ChangeTime = 0;
-
-	info_buf.CreationTime = 0;	/* don't change */
-	info_buf.Attributes = cpu_to_le32(dosattr);
-
-	/*
-	 * If the file is already open for write, just use that fileid
-	 */
-	open_file = find_writable_file(cifsInode);
-	if (open_file) {
-		netfid = open_file->netfid;
-		netpid = open_file->pid;
-		goto set_via_filehandle;
-	}
-
-	/*
-	 * NT4 apparently returns success on this call, but it doesn't
-	 * really work.
-	 */
-	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
-				     &info_buf, cifs_sb->local_nls,
-				     cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != -EOPNOTSUPP && rc != -EINVAL)
-			goto out;
-	}
-
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
-	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
-			 CREATE_NOT_DIR, &netfid, &oplock,
-			 NULL, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-	if (rc != 0) {
-		if (rc == -EIO)
-			rc = -EINVAL;
-		goto out;
-	}
-
-	netpid = current->tgid;
-
-set_via_filehandle:
-	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
-	if (open_file == NULL)
-		CIFSSMBClose(xid, pTcon, netfid);
-	else
-		atomic_dec(&open_file->wrtPending);
-out:
-	return rc;
-}
-
 static int
 cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 {
-- 
cgit v1.2.3


From 31bd61f2bb79e098117d823e054342b03aa87668 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Wed, 17 Sep 2008 16:45:37 +1000
Subject: [XFS] Move memory allocations for log tracing out of the critical
 path

Memory allocations for log->l_grant_trace and iclog->ic_trace are done on
demand when the first event is logged. In xlog_state_get_iclog_space() we
call xlog_trace_iclog() under a spinlock and allocating memory here can
cause us to sleep with a spinlock held and deadlock the system.

For the log grant tracing we use KM_NOSLEEP but that means we can lose
trace entries. Since there is no locking to serialize the log grant
tracing we could race and have multiple allocations and leak memory.

So move the allocations to where we initialize the log/iclog structures.
Use KM_NOFS to avoid recursing into the filesystem and drop log->l_trace
since it's not even used.

SGI-PV: 983738

SGI-Modid: xfs-linux-melb:xfs-kern:31896a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_log.c      | 60 ++++++++++++++++++++++++++++++++++-----------------
 fs/xfs/xfs_log_priv.h |  1 -
 2 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ccba14eb9db..7812bd036e2 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
 STATIC int	xlog_iclogs_empty(xlog_t *log);
 
 #if defined(XFS_LOG_TRACE)
+
+#define XLOG_TRACE_LOGGRANT_SIZE	2048
+#define XLOG_TRACE_ICLOG_SIZE		256
+
+void
+xlog_trace_loggrant_alloc(xlog_t *log)
+{
+	log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_loggrant_dealloc(xlog_t *log)
+{
+	ktrace_free(log->l_grant_trace);
+}
+
 void
 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 {
 	unsigned long cnts;
 
-	if (!log->l_grant_trace) {
-		log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
-		if (!log->l_grant_trace)
-			return;
-	}
 	/* ticket counts are 1 byte each */
 	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
 
@@ -156,11 +167,21 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 		     (void *)((unsigned long)tic->t_unit_res));
 }
 
+void
+xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
+{
+	iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
+{
+	ktrace_free(iclog->ic_trace);
+}
+
 void
 xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 {
-	if (!iclog->ic_trace)
-		iclog->ic_trace = ktrace_alloc(256, KM_NOFS);
 	ktrace_enter(iclog->ic_trace,
 		     (void *)((unsigned long)state),
 		     (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 		     (void *)NULL, (void *)NULL);
 }
 #else
+
+#define	xlog_trace_loggrant_alloc(log)
+#define	xlog_trace_loggrant_dealloc(log)
 #define	xlog_trace_loggrant(log,tic,string)
+
+#define	xlog_trace_iclog_alloc(iclog)
+#define	xlog_trace_iclog_dealloc(iclog)
 #define	xlog_trace_iclog(iclog,state)
+
 #endif /* XFS_LOG_TRACE */
 
 
@@ -1231,6 +1259,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	spin_lock_init(&log->l_grant_lock);
 	sv_init(&log->l_flush_wait, 0, "flush_wait");
 
+	xlog_trace_loggrant_alloc(log);
 	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
 	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
 
@@ -1285,6 +1314,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
 		sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
 
+		xlog_trace_iclog_alloc(iclog);
+
 		iclogp = &iclog->ic_next;
 	}
 	*iclogp = log->l_iclog;			/* complete ring */
@@ -1565,11 +1596,7 @@ xlog_dealloc_log(xlog_t *log)
 		sv_destroy(&iclog->ic_force_wait);
 		sv_destroy(&iclog->ic_write_wait);
 		xfs_buf_free(iclog->ic_bp);
-#ifdef XFS_LOG_TRACE
-		if (iclog->ic_trace != NULL) {
-			ktrace_free(iclog->ic_trace);
-		}
-#endif
+		xlog_trace_iclog_dealloc(iclog);
 		next_iclog = iclog->ic_next;
 		kmem_free(iclog);
 		iclog = next_iclog;
@@ -1578,14 +1605,7 @@ xlog_dealloc_log(xlog_t *log)
 	spinlock_destroy(&log->l_grant_lock);
 
 	xfs_buf_free(log->l_xbuf);
-#ifdef XFS_LOG_TRACE
-	if (log->l_trace != NULL) {
-		ktrace_free(log->l_trace);
-	}
-	if (log->l_grant_trace != NULL) {
-		ktrace_free(log->l_grant_trace);
-	}
-#endif
+	xlog_trace_loggrant_dealloc(log);
 	log->l_mp->m_log = NULL;
 	kmem_free(log);
 }	/* xlog_dealloc_log */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c8a5b22ee3e..e7d8f84443f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -448,7 +448,6 @@ typedef struct log {
 	int			l_grant_write_bytes;
 
 #ifdef XFS_LOG_TRACE
-	struct ktrace		*l_trace;
 	struct ktrace		*l_grant_trace;
 #endif
 
-- 
cgit v1.2.3


From 6efdf281777eb07fac28ac2b2d7df1e619ee6da1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 17 Sep 2008 16:49:33 +1000
Subject: [XFS] Fix regression introduced by remount fixup

Logically we would return an error in xfs_fs_remount code to prevent users
from believing they might have changed mount options using remount which
can't be changed.

But unfortunately mount(8) adds all options from mtab and fstab to the
mount arguments in some cases so we can't blindly reject options, but have
to check for each specified option if it actually differs from the
currently set option and only reject it if that's the case.

Until that is implemented we return success for every remount request, and
silently ignore all options that we can't actually change.

SGI-PV: 985710

SGI-Modid: xfs-linux-melb:xfs-kern:31908a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 73c65f19e54..18d3c848783 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1302,9 +1302,29 @@ xfs_fs_remount(
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			break;
 		default:
+			/*
+			 * Logically we would return an error here to prevent
+			 * users from believing they might have changed
+			 * mount options using remount which can't be changed.
+			 *
+			 * But unfortunately mount(8) adds all options from
+			 * mtab and fstab to the mount arguments in some cases
+			 * so we can't blindly reject options, but have to
+			 * check for each specified option if it actually
+			 * differs from the currently set option and only
+			 * reject it if that's the case.
+			 *
+			 * Until that is implemented we return success for
+			 * every remount request, and silently ignore all
+			 * options that we can't actually change.
+			 */
+#if 0
 			printk(KERN_INFO
 	"XFS: mount option \"%s\" not supported for remount\n", p);
 			return -EINVAL;
+#else
+			return 0;
+#endif
 		}
 	}
 
-- 
cgit v1.2.3


From 364f358a734ddcd827c662ccbfa58ee3ac510762 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Wed, 17 Sep 2008 16:50:14 +1000
Subject: [XFS] Prevent direct I/O from mapping extents beyond eof

With the help from some tracing I found that we try to map extents beyond
eof when doing a direct I/O read. It appears that the way to inform the
generic direct I/O path (ie do_direct_IO()) that we have breached eof is
to return an unmapped buffer from xfs_get_blocks_direct(). This will cause
do_direct_IO() to jump to the hole handling code where is will check for
eof and then abort.

This problem was found because a direct I/O read was trying to map beyond
eof and was encountering delayed allocations. The delayed allocations
beyond eof are speculative allocations and they didn't get converted when
the direct I/O flushed the file because there was only enough space in the
current AG to convert and write out the dirty pages within eof. Note that
xfs_iomap_write_allocate() wont necessarily convert all the delayed
allocation passed to it - it will return after allocating the first extent
- so if the delayed allocation extends beyond eof then it will stay that
way.

SGI-PV: 983683

SGI-Modid: xfs-linux-melb:xfs-kern:31929a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/linux-2.6/xfs_aops.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index f42f80a3b1f..a44d68eb50b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1338,6 +1338,10 @@ __xfs_get_blocks(
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
+
+	if (!create && direct && offset >= i_size_read(inode))
+		return 0;
+
 	error = xfs_iomap(XFS_I(inode), offset, size,
 			     create ? flags : BMAPI_READ, &iomap, &niomap);
 	if (error)
-- 
cgit v1.2.3


From b5b8c9acd547244eb2b7d0280ba38b9dd01971cc Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Wed, 17 Sep 2008 16:50:50 +1000
Subject: [XFS] Fix barrier status change detection.

The current code in xlog_iodone() uses the wrong macro to check if the
barrier has been cleared due to an EOPNOTSUPP error form the lower layer.

SGI-PV: 986143

SGI-Modid: xfs-linux-melb:xfs-kern:31984a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Nathaniel W. Turner <nate@houseofnate.net>
Signed-off-by: Peter Leckie <pleckie@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7812bd036e2..503ea89e8b9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1037,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp)
 	 * layer, it means the underlyin device no longer supports
 	 * barrier I/O. Warn loudly and turn off barriers.
 	 */
-	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) {
+	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) {
 		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		xfs_fs_cmn_err(CE_WARN, l->l_mp,
 				"xlog_iodone: Barriers are no longer supported"
-- 
cgit v1.2.3


From f9114eba1eb08ee75fd0f1eee780f0290fb3c043 Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Wed, 17 Sep 2008 16:51:21 +1000
Subject: [XFS] Prevent lockdep false positives when locking two inodes.

If we call xfs_lock_two_inodes() to grab both the iolock and the ilock,
then drop the ilocks on both inodes, then grab them again (as
xfs_swap_extents() does) then lockdep will report a locking order problem.
This is a false positive.

To avoid this, disallow xfs_lock_two_inodes() fom locking both inode locks
at once - force calers to make two separate calls. This means that nested
dropping and regaining of the ilocks will retain the same lockdep subclass
and so lockdep will not see anything wrong with this code.

SGI-PV: 986238

SGI-Modid: xfs-linux-melb:xfs-kern:31999a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Peter Leckie <pleckie@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_dfrag.c    | 9 ++++++++-
 fs/xfs/xfs_vnodeops.c | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 760f4c5b516..75b0cd4da0e 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -149,7 +149,14 @@ xfs_swap_extents(
 
 	sbp = &sxp->sx_stat;
 
-	xfs_lock_two_inodes(ip, tip, lock_flags);
+	/*
+	 * we have to do two separate lock calls here to keep lockdep
+	 * happy. If we try to get all the locks in one call, lock will
+	 * report false positives when we drop the ILOCK and regain them
+	 * below.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 	locked = 1;
 
 	/* Verify that both files have the same format */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index aa238c8fbd7..98a0aecafdd 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1838,6 +1838,12 @@ again:
 #endif
 }
 
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
+ */
 void
 xfs_lock_two_inodes(
 	xfs_inode_t		*ip0,
@@ -1848,6 +1854,8 @@ xfs_lock_two_inodes(
 	int			attempts = 0;
 	xfs_log_item_t		*lp;
 
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
 	ASSERT(ip0->i_ino != ip1->i_ino);
 
 	if (ip0->i_ino > ip1->i_ino) {
-- 
cgit v1.2.3


From e1f5dbd7077eebec794452a516cb02f1669b036d Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Wed, 17 Sep 2008 16:52:13 +1000
Subject: [XFS] Fix use-after-free with buffers

We have a use-after-free issue where log completions access buffers via
the buffer log item and the buffer has already been freed. Fix this by
taking a reference on the buffer when attaching the buffer log item and
release the hold when the buffer log item is detached and we no longer
need the buffer. Also create a new function xfs_buf_item_free() to combine
some common code.

SGI-PV: 985757

SGI-Modid: xfs-linux-melb:xfs-kern:32025a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_buf_item.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 608c30c3f76..002fc2617c8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,6 +732,7 @@ xfs_buf_item_init(
 	bip->bli_item.li_ops = &xfs_buf_item_ops;
 	bip->bli_item.li_mountp = mp;
 	bip->bli_buf = bp;
+	xfs_buf_hold(bp);
 	bip->bli_format.blf_type = XFS_LI_BUF;
 	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
 	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
 	return (bip->bli_flags & XFS_BLI_DIRTY);
 }
 
+STATIC void
+xfs_buf_item_free(
+	xfs_buf_log_item_t	*bip)
+{
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig);
+	kmem_free(bip->bli_logged);
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
 /*
  * This is called when the buf log item is no longer needed.  It should
  * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
 	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
 		XFS_BUF_CLR_IODONE_FUNC(bp);
 	}
-
-#ifdef XFS_TRANS_DEBUG
-	kmem_free(bip->bli_orig);
-	bip->bli_orig = NULL;
-	kmem_free(bip->bli_logged);
-	bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
-	ktrace_free(bip->bli_trace);
-#endif
-	kmem_zone_free(xfs_buf_item_zone, bip);
+	xfs_buf_rele(bp);
+	xfs_buf_item_free(bip);
 }
 
 
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
 
 	ASSERT(bip->bli_buf == bp);
 
+	xfs_buf_rele(bp);
 	mp = bip->bli_item.li_mountp;
 
 	/*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
 	 * xfs_trans_delete_ail() drops the AIL lock.
 	 */
 	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
-
-#ifdef XFS_TRANS_DEBUG
-	kmem_free(bip->bli_orig);
-	bip->bli_orig = NULL;
-	kmem_free(bip->bli_logged);
-	bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
-	ktrace_free(bip->bli_trace);
-#endif
-	kmem_zone_free(xfs_buf_item_zone, bip);
+	xfs_buf_item_free(bip);
 }
 
 #if defined(XFS_BLI_TRACE)
-- 
cgit v1.2.3


From 2fd6f6ec64ff347447d26646ac6188f3658b383c Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Wed, 17 Sep 2008 16:52:50 +1000
Subject: [XFS] Don't do I/O beyond eof when unreserving space

When unreserving space with boundaries that are not block aligned we round
up the start and round down the end boundaries and then use this function,
xfs_zero_remaining_bytes(), to zero the parts of the blocks that got
dropped during the rounding. The problem is we don't consider if these
blocks are beyond eof. Worse still is if we encounter delayed allocations
beyond eof we will try to use the magic delayed allocation block number as
a real block number. If the file size is ever extended to expose these
blocks then we'll go through xfs_zero_eof() to zero them anyway.

SGI-PV: 983683

SGI-Modid: xfs-linux-melb:xfs-kern:32055a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_vnodeops.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 98a0aecafdd..8b6812f66a1 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3160,6 +3160,13 @@ error1:	/* Just cancel transaction */
 /*
  * Zero file bytes between startoff and endoff inclusive.
  * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
  */
 STATIC int
 xfs_zero_remaining_bytes(
@@ -3176,6 +3183,17 @@ xfs_zero_remaining_bytes(
 	int			nimap;
 	int			error = 0;
 
+	/*
+	 * Avoid doing I/O beyond eof - it's not necessary
+	 * since nothing can read beyond eof.  The space will
+	 * be zeroed when the file is extended anyway.
+	 */
+	if (startoff >= ip->i_size)
+		return 0;
+
+	if (endoff > ip->i_size)
+		endoff = ip->i_size;
+
 	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
 				XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp);
-- 
cgit v1.2.3


From 0855f310dff76ae42c5aac32f600f8f692bbd23f Mon Sep 17 00:00:00 2001
From: Sebastian Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Sep 2008 11:17:29 +0200
Subject: UBIFS: create the name of the background thread in every case

If the ubifs partition is mounted RO and then remounted RW we end
up with no thread name in ubifs_remount_rw() and the thread appears
nameless.

Signed-off-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7562464ac83..3f4902060c7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1024,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_dereg;
 	}
 
+	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
 	if (!mounted_read_only) {
 		err = alloc_wbufs(c);
 		if (err)
 			goto out_cbuf;
 
 		/* Create background thread */
-		sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
-			c->vi.vol_id);
 		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
 		if (!c->bgt)
 			c->bgt = ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From 6dcfac4f13d6b32fbaa60b64a23249999e66af8e Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 12 Sep 2008 12:27:47 +0300
Subject: UBIFS: TNC / GC race fixes

- update GC sequence number if any nodes may have been moved
even if GC did not finish the LEB
- don't ignore error return when reading

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c  | 14 +++++++++++---
 fs/ubifs/tnc.c |  2 +-
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 13f1019c859..02aba36fe3d 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,15 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 
 		err = move_nodes(c, sleb);
 		if (err)
-			goto out;
+			goto out_inc_seq;
 
 		err = gc_sync_wbufs(c);
 		if (err)
-			goto out;
+			goto out_inc_seq;
 
 		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
 		if (err)
-			goto out;
+			goto out_inc_seq;
 
 		/* Allow for races with TNC */
 		c->gced_lnum = lnum;
@@ -369,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 out:
 	ubifs_scan_destroy(sleb);
 	return err;
+
+out_inc_seq:
+	/* We may have moved at least some nodes so allow for races with TNC */
+	c->gced_lnum = lnum;
+	smp_wmb();
+	c->gc_seq += 1;
+	smp_wmb();
+	goto out;
 }
 
 /**
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 7da209ab937..7634c597088 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1476,7 +1476,7 @@ again:
 	}
 
 	err = fallible_read_node(c, key, &zbr, node);
-	if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+	if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
 		/*
 		 * The node may have been GC'ed out from under us so try again
 		 * while keeping the TNC mutex locked.
-- 
cgit v1.2.3


From 6e14968c869cd30e236bb626dd0c16421d2a658d Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Wed, 17 Sep 2008 10:52:07 +0300
Subject: UBIFS: remove incorrect assert

The assert was not valid because one of the variables
'taken_empty_lebs' has transient values out of sync
with the other variables.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/find.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index e045c8b5542..47814cde240 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -507,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
-	ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
 	if (rsvd_idx_lebs < lebs)
 		/*
 		 * OK to allocate an empty LEB, but we still don't want to go
-- 
cgit v1.2.3


From 7424bac82ff3bd956ea04101550e01bdae17284d Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Wed, 17 Sep 2008 22:09:41 +0400
Subject: UBIFS: fix printk format warnings

fs/ubifs/dir.c:428: warning: format '%llu' expects type 'long long
unsigned int', but argument 5 has type 'long unsigned int'

fs/ubifs/debug.c:541: warning: format '%llu' expects type 'long long
unsigned int', but argument 2 has type 'long unsigned int'

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 2 +-
 fs/ubifs/dir.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b9cb7747375..d7f7645779f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
 		for (i = 0; i < n; i++)
 			printk(KERN_DEBUG "\t  ino %llu\n",
-			       le64_to_cpu(orph->inos[i]));
+			       (unsigned long long)le64_to_cpu(orph->inos[i]));
 		break;
 	}
 	default:
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 2b267c9a180..526c01ec800 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 	while (1) {
 		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
-			dent->name, le64_to_cpu(dent->inum),
+			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
 			key_hash_flash(c, &dent->key));
 		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
 
-- 
cgit v1.2.3


From 37ec89e83c4ca98323fe74f139301ff3949cfdb6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 18 Sep 2008 13:49:32 +0100
Subject: GFS2: The war on bloat

The following patch shrinks the gfs2_args structure which is embedded in
every GFS2 superblock. It cuts down the size of the options to a single
unsigned int (the 13 bits of bitfields will be rounded up to that size
by the compiler) from the current 11 unsigned ints. So on x86 thats 44
bytes shrinking to 4 bytes, in each and every GFS2 superblock.

Signed-off-by: Steven Whitehouse <swhitho@redhat.com>
---
 fs/gfs2/incore.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a1777a1927b..b2c5784092c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,21 +386,21 @@ struct gfs2_statfs_change_host {
 #define GFS2_DATA_ORDERED	2
 
 struct gfs2_args {
-	char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
-	char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
-	char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
-	int ar_spectator; /* Don't get a journal because we're always RO */
-	int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
-	int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
-	int ar_localcaching; /* Local-style caching (dangerous on multihost) */
-	int ar_debug; /* Oops on errors instead of trying to be graceful */
-	int ar_upgrade; /* Upgrade ondisk/multihost format */
-	unsigned int ar_num_glockd; /* Number of glockd threads */
-	int ar_posix_acl; /* Enable posix acls */
-	int ar_quota; /* off/account/on */
-	int ar_suiddir; /* suiddir support */
-	int ar_data; /* ordered/writeback */
-	int ar_meta; /* mount metafs */
+	char ar_lockproto[GFS2_LOCKNAME_LEN];	/* Name of the Lock Protocol */
+	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
+	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
+	unsigned int ar_spectator:1;		/* Don't get a journal */
+	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
+	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
+	unsigned int ar_localcaching:1;		/* Local caching */
+	unsigned int ar_debug:1;		/* Oops on errors */
+	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
+	unsigned int ar_posix_acl:1;		/* Enable posix acls */
+	unsigned int ar_quota:2;		/* off/account/on */
+	unsigned int ar_suiddir:1;		/* suiddir support */
+	unsigned int ar_data:2;			/* ordered/writeback */
+	unsigned int ar_meta:1;			/* mount metafs */
+	unsigned int ar_num_glockd;		/* Number of glockd threads */
 };
 
 struct gfs2_tune {
-- 
cgit v1.2.3


From 719ee344675c2efed9115934f19aa66a526b6e5b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 18 Sep 2008 13:53:59 +0100
Subject: GFS2: high time to take some time over atime

Until now, we've used the same scheme as GFS1 for atime. This has failed
since atime is a per vfsmnt flag, not a per fs flag and as such the
"noatime" flag was not getting passed down to the filesystems. This
patch removes all the "special casing" around atime updates and we
simply use the VFS's atime code.

The net result is that GFS2 will now support all the same atime related
mount options of any other filesystem on a per-vfsmnt basis. We do lose
the "lazy atime" updates, but we gain "relatime". We could add lazy
atime to the VFS at a later date, if there is a requirement for that
variant still - I suspect relatime will be enough.

Also we lose about 100 lines of code after this patch has been applied,
and I have a suspicion that it will speed things up a bit, even when
atime is "on". So it seems like a nice clean up as well.

From a user perspective, everything stays the same except the loss of
the per-fs atime quantum tweekable (ought to be per-vfsmnt at the very
least, and to be honest I don't think anybody ever used it) and that a
number of options which were ignored before now work correctly.

Please let me know if you've got any comments. I'm pushing this out
early so that you can all see what my plans are.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c       |   2 -
 fs/gfs2/glock.h       |   1 -
 fs/gfs2/incore.h      |   2 -
 fs/gfs2/inode.c       | 107 ++++----------------------------------------------
 fs/gfs2/inode.h       |   1 -
 fs/gfs2/ops_address.c |  16 ++++----
 fs/gfs2/ops_file.c    |  16 ++++----
 fs/gfs2/ops_fstype.c  |  23 +++--------
 fs/gfs2/ops_super.c   |  53 +++++++++++++++++--------
 fs/gfs2/sys.c         |  11 ------
 10 files changed, 65 insertions(+), 167 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 806e1eb0aa0..c962283d4e7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1580,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 		*p++ = 'a';
 	if (flags & GL_EXACT)
 		*p++ = 'E';
-	if (flags & GL_ATIME)
-		*p++ = 'a';
 	if (flags & GL_NOCACHE)
 		*p++ = 'c';
 	if (test_bit(HIF_HOLDER, &iflags))
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70f..695c6b19361 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
 #define GL_ASYNC		0x00000040
 #define GL_EXACT		0x00000080
 #define GL_SKIP			0x00000100
-#define GL_ATIME		0x00000200
 #define GL_NOCACHE		0x00000400
 
 #define GLR_TRYFAILED		13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b2c5784092c..f1ed3a1bf8a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -420,7 +420,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_scale_den; /* Denominator */
 	unsigned int gt_quota_cache_secs;
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
-	unsigned int gt_atime_quantum; /* Min secs between atime updates */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
 	unsigned int gt_stall_secs; /* Detects trouble! */
@@ -433,7 +432,6 @@ enum {
 	SDF_JOURNAL_CHECKED	= 0,
 	SDF_JOURNAL_LIVE	= 1,
 	SDF_SHUTDOWN		= 2,
-	SDF_NOATIME		= 3,
 };
 
 #define GFS2_FSNAME_LEN		256
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c8a959c09f1..7cee695fa44 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/security.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
+	struct timespec atime;
 	u16 height, depth;
 
 	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_size = be64_to_cpu(str->di_size);
 	i_size_write(&ip->i_inode, di->di_size);
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
-	ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
-	ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	atime.tv_sec = be64_to_cpu(str->di_atime);
+	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+		ip->i_inode.i_atime = atime;
 	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
 	ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
 	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -1157,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 	unsigned int x;
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
-	error = gfs2_glock_nq_atime(&i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
 		return error;
@@ -1193,101 +1197,6 @@ out:
 	return error;
 }
 
-/**
- * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
- *       conditionally update the inode's atime
- * @gh: the holder to acquire
- *
- * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
- * Update if the difference between the current time and the inode's current
- * atime is greater than an interval specified at mount.
- *
- * Returns: errno
- */
-
-int gfs2_glock_nq_atime(struct gfs2_holder *gh)
-{
-	struct gfs2_glock *gl = gh->gh_gl;
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct gfs2_inode *ip = gl->gl_object;
-	s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
-	unsigned int state;
-	int flags;
-	int error;
-	struct timespec tv = CURRENT_TIME;
-
-	if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
-	    gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
-	    gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
-		return -EINVAL;
-
-	state = gh->gh_state;
-	flags = gh->gh_flags;
-
-	error = gfs2_glock_nq(gh);
-	if (error)
-		return error;
-
-	if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
-	    (sdp->sd_vfs->s_flags & MS_RDONLY))
-		return 0;
-
-	if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-		gfs2_glock_dq(gh);
-		gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
-				   gh);
-		error = gfs2_glock_nq(gh);
-		if (error)
-			return error;
-
-		/* Verify that atime hasn't been updated while we were
-		   trying to get exclusive lock. */
-
-		tv = CURRENT_TIME;
-		if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-			struct buffer_head *dibh;
-			struct gfs2_dinode *di;
-
-			error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-			if (error == -EROFS)
-				return 0;
-			if (error)
-				goto fail;
-
-			error = gfs2_meta_inode_buffer(ip, &dibh);
-			if (error)
-				goto fail_end_trans;
-
-			ip->i_inode.i_atime = tv;
-
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			di = (struct gfs2_dinode *)dibh->b_data;
-			di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
-			di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
-			brelse(dibh);
-
-			gfs2_trans_end(sdp);
-		}
-
-		/* If someone else has asked for the glock,
-		   unlock and let them have it. Then reacquire
-		   in the original state. */
-		if (gfs2_glock_is_blocking(gl)) {
-			gfs2_glock_dq(gh);
-			gfs2_holder_reinit(state, flags, gh);
-			return gfs2_glock_nq(gh);
-		}
-	}
-
-	return 0;
-
-fail_end_trans:
-	gfs2_trans_end(sdp);
-fail:
-	gfs2_glock_dq(gh);
-	return error;
-}
-
 static int
 __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index bfd2afc0c90..2d43f69610a 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -92,7 +92,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		   const struct gfs2_inode *ip);
 int gfs2_permission(struct inode *inode, int mask);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_glock_nq_atime(struct gfs2_holder *gh);
 int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ae7126aeb44..27563816e1c 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
 	int error;
 
 	unlock_page(page);
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	error = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (unlikely(error))
 		goto out;
 	error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 	struct gfs2_holder gh;
 	int ret;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	ret = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
 	if (unlikely(ret))
 		goto out_uninit;
 	if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned to = from + len;
 	struct page *page;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
-	error = gfs2_glock_nq_atime(&ip->i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
 	if (unlikely(error))
 		goto out_uninit;
 
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	 * unfortunately have the option of only flushing a range like
 	 * the VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
-	rv = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	rv = gfs2_glock_nq(&gh);
 	if (rv)
 		return rv;
 	rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411..3a747f8e218 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
 	u64 offset = file->f_pos;
 	int error;
 
-	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
-	error = gfs2_glock_nq_atime(&d_gh);
+	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	error = gfs2_glock_nq(&d_gh);
 	if (error) {
 		gfs2_holder_uninit(&d_gh);
 		return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	int error;
 	u32 fsflags;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	error = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (error)
 		return error;
 
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	struct gfs2_alloc *al;
 	int ret;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
-	ret = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out;
 
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	struct gfs2_holder i_gh;
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
-	error = gfs2_glock_nq_atime(&i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
 		return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ae35f097aa6..b117fcf2c4f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -70,7 +70,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_scale_den = 1;
 	gt->gt_quota_cache_secs = 300;
 	gt->gt_quota_quantum = 60;
-	gt->gt_atime_quantum = 3600;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
 	gt->gt_stall_secs = 600;
@@ -135,22 +134,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	return sdp;
 }
 
-static void init_vfs(struct super_block *sb, unsigned noatime)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-
-	sb->s_magic = GFS2_MAGIC;
-	sb->s_op = &gfs2_super_ops;
-	sb->s_export_op = &gfs2_export_ops;
-	sb->s_time_gran = 1;
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-
-	if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
-		set_bit(noatime, &sdp->sd_flags);
-
-	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
-	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
-}
 
 /**
  * gfs2_check_sb - Check superblock
@@ -1100,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 	}
 
-	init_vfs(sb, SDF_NOATIME);
+	sb->s_magic = GFS2_MAGIC;
+	sb->s_op = &gfs2_super_ops;
+	sb->s_export_op = &gfs2_export_ops;
+	sb->s_time_gran = 1;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	/* Set up the buffer cache and fill in some fake block size values
 	   to allow us to read-in the on-disk superblock. */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 8f332d26b5d..d5355d9b592 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -38,6 +39,7 @@
 #include "dir.h"
 #include "eattr.h"
 #include "bmap.h"
+#include "meta_io.h"
 
 /**
  * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,41 @@
 static int gfs2_write_inode(struct inode *inode, int sync)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-
-	/* Check this is a "normal" inode */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		if (current->flags & PF_MEMALLOC)
-			return 0;
-		if (sync)
-			gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	struct buffer_head *bh;
+	struct timespec atime;
+	struct gfs2_dinode *di;
+	int ret = 0;
+
+	/* Check this is a "normal" inode, etc */
+	if (!test_bit(GIF_USER, &ip->i_flags) ||
+	    (current->flags & PF_MEMALLOC))
+		return 0;
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (ret)
+		goto do_flush;
+	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (ret)
+		goto do_unlock;
+	ret = gfs2_meta_inode_buffer(ip, &bh);
+	if (ret == 0) {
+		di = (struct gfs2_dinode *)bh->b_data;
+		atime.tv_sec = be64_to_cpu(di->di_atime);
+		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+		if (timespec_compare(&inode->i_atime, &atime) > 0) {
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_dinode_out(ip, bh->b_data);
+		}
+		brelse(bh);
 	}
-
-	return 0;
+	gfs2_trans_end(sdp);
+do_unlock:
+	gfs2_glock_dq_uninit(&gh);
+do_flush:
+	if (sync != 0)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	return ret;
 }
 
 /**
@@ -297,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		}
 	}
 
-	if (*flags & (MS_NOATIME | MS_NODIRATIME))
-		set_bit(SDF_NOATIME, &sdp->sd_flags);
-	else
-		clear_bit(SDF_NOATIME, &sdp->sd_flags);
-
-	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
-	*flags |= MS_NOATIME | MS_NODIRATIME;
-
 	return error;
 }
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3..7e1879f1a02 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
 ARGS_ATTR(data,            "%d\n");
 
-/* one oddball doesn't fit the macro mold */
-static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%d\n",
-			!!test_bit(SDF_NOATIME, &sdp->sd_flags));
-}
-static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
-
 static struct attribute *args_attrs[] = {
 	&args_attr_lockproto.attr,
 	&args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
 	&args_attr_quota.attr,
 	&args_attr_suiddir.attr,
 	&args_attr_data.attr,
-	&args_attr_noatime.attr,
 	NULL,
 };
 
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
-TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
-	&tune_attr_atime_quantum.attr,
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
-- 
cgit v1.2.3


From 914258bf2cb22bf4336a1b1d90c551b4b11ca5aa Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 6 Oct 2008 21:35:40 -0400
Subject: ext4/jbd2: Avoid WARN() messages when failing to write to the
 superblock

This fixes some very common warnings reported by kerneloops.org

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c   | 23 ++++++++++++++++++++++-
 fs/jbd2/journal.c | 27 +++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8175318abd8..fb5766e2bff 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2810,13 +2810,34 @@ static void ext4_commit_super(struct super_block *sb,
 
 	if (!sbh)
 		return;
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "ext4: previous I/O error to "
+		       "superblock detected for %s.\n", sb->s_id);
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
 	es->s_wtime = cpu_to_le32(get_seconds());
 	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-	if (sync)
+	if (sync) {
 		sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			printk(KERN_ERR "ext4: I/O error while writing "
+			       "superblock for %s.\n", sb->s_id);
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
 }
 
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81186a29742..01c3901c3a0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1255,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 		goto out;
 	}
 
+	if (buffer_write_io_error(bh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD2: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal->j_devname);
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
 	spin_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1266,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
-	if (wait)
+	if (wait) {
 		sync_dirty_buffer(bh);
-	else
+		if (buffer_write_io_error(bh)) {
+			printk(KERN_ERR "JBD2: I/O error detected "
+			       "when updating journal superblock for %s.\n",
+			       journal->j_devname);
+			clear_buffer_write_io_error(bh);
+			set_buffer_uptodate(bh);
+		}
+	} else
 		ll_rw_block(SWRITE, 1, &bh);
 
 out:
-- 
cgit v1.2.3


From f702ba0fd7d50b5f5f5aea5317875a10d40b869f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 22 Sep 2008 15:21:01 -0400
Subject: ext4: Don't use 'struct dentry' for internal lookups

This is a port of a patch from Linus which fixes a 200+ byte stack
usage problem in ext4_get_parent().

It's more efficient to pass down only the actual parts of the dentry
that matter: the parent inode and the name, instead of allocating a
struct dentry on the stack.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 68 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a1f72d217c7..5295a9225cf 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -159,7 +159,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
 static void dx_set_limit(struct dx_entry *entries, unsigned value);
 static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
 static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
@@ -177,8 +177,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
-		       struct ext4_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+		const struct qstr *d_name,
+		struct ext4_dir_entry_2 **res_dir,
+		int *err);
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -345,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(const struct qstr *d_name, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
 {
 	unsigned count, indirect;
@@ -356,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (dentry)
-		dir = dentry->d_parent->d_inode;
 	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
 		goto fail;
 	root = (struct dx_root *) bh->b_data;
@@ -373,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	}
 	hinfo->hash_version = root->info.hash_version;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-	if (dentry)
-		ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	if (d_name)
+		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -649,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	hinfo.hash = start_hash;
 	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 
@@ -805,15 +805,15 @@ static inline int ext4_match (int len, const char * const name,
  */
 static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
-				  struct dentry *dentry,
+				  const struct qstr *d_name,
 				  unsigned long offset,
 				  struct ext4_dir_entry_2 ** res_dir)
 {
 	struct ext4_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = d_name->name;
+	int namelen = d_name->len;
 
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -852,7 +852,8 @@ static inline int search_dirblock(struct buffer_head *bh,
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+					const struct qstr *d_name,
 					struct ext4_dir_entry_2 ** res_dir)
 {
 	struct super_block *sb;
@@ -866,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 	int num = 0;
 	ext4_lblk_t  nblocks;
 	int i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = dentry->d_name.len;
+	namelen = d_name->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-		bh = ext4_dx_find_entry(dentry, res_dir, &err);
+		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -928,7 +928,7 @@ restart:
 			brelse(bh);
 			goto next;
 		}
-		i = search_dirblock(bh, dir, dentry,
+		i = search_dirblock(bh, dir, d_name,
 			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
@@ -962,7 +962,7 @@ cleanup_and_exit:
 	return ret;
 }
 
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 		       struct ext4_dir_entry_2 **res_dir, int *err)
 {
 	struct super_block * sb;
@@ -973,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
-	int namelen = dentry->d_name.len;
-	const u8 *name = dentry->d_name.name;
-	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen = d_name->len;
+	const u8 *name = d_name->name;
 
 	sb = dir->i_sb;
 	/* NFS may look up ".." - look at dx_root directory block */
 	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+		if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
 			return NULL;
 	} else {
 		frame = frames;
@@ -1041,7 +1040,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext4_find_entry(dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
@@ -1064,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	unsigned long ino;
 	struct dentry *parent;
 	struct inode *inode;
-	struct dentry dotdot;
+	static const struct qstr dotdot = {
+		.name = "..",
+		.len = 2,
+	};
 	struct ext4_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_parent = child; /* confusing, isn't it! */
-
-	bh = ext4_find_entry(&dotdot, &de);
+	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
 	inode = NULL;
 	if (!bh)
 		return ERR_PTR(-ENOENT);
@@ -1508,7 +1506,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -2089,7 +2087,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry(dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2151,7 +2149,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry(dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2312,7 +2310,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry(old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2325,7 +2323,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry(new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
 			brelse(new_bh);
@@ -2392,7 +2390,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext4_dir_entry_2 *old_de2;
 
-		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
 		if (old_bh2) {
 			retval = ext4_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
-- 
cgit v1.2.3


From 232087cb734c7035c0a8947fb05d3e8092ff6c4d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 15 Sep 2008 13:22:54 +0300
Subject: cifs: don't use GFP_KERNEL with GFP_NOFS

GFP_KERNEL and GFP_NOFS are mutually exclusive. If you combine them, you end up
with plain GFP_KERNEL which can deadlock in cases where you really want
GFP_NOFS.

Cc: Steve French <sfrench@samba.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c      | 6 ++----
 fs/cifs/transport.c | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe315..654d972a88f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
    but it may be more efficient to always alloc same size
    albeit slightly larger than necessary and maxbuffersize
    defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
    but it may be more efficient to always alloc same size
    albeit slightly larger than necessary and maxbuffersize
    defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e286db9f5ee..bf0e6d8e382 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 		return NULL;
 	}
 
-	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    GFP_KERNEL | GFP_NOFS);
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
-- 
cgit v1.2.3


From 2846d3864738dd6e290755d0692cf377e09ba79f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 22 Sep 2008 21:33:33 -0400
Subject: cifs: have find_writeable_file prefer filehandles opened by same task

When the CIFS client goes to write out pages, it needs to pick a
filehandle to write to. find_writeable_file however just picks the
first filehandle that it finds. This can cause problems when a lock
is issued against a particular filehandle and we pick a different
filehandle to write to.

This patch tries to avert this situation by having find_writable_file
prefer filehandles that have a pid that matches the current task.
This seems to fix lock test 11 from the connectathon test suite when
run against a windows server.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cbefe1f1f9f..d39e852a28a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1065,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
+	bool any_available = false;
 	int rc;
 
 	/* Having a null inode here (because mapping->host was set to zero by
@@ -1080,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	read_lock(&GlobalSMBSeslock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
+		if (open_file->closePend ||
+		    (!any_available && open_file->pid != current->tgid))
 			continue;
+
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1131,6 +1134,11 @@ refind_writable:
 			   of the loop here. */
 		}
 	}
+	/* couldn't find useable FH with same pid, try any available */
+	if (!any_available) {
+		any_available = true;
+		goto refind_writable;
+	}
 	read_unlock(&GlobalSMBSeslock);
 	return NULL;
 }
-- 
cgit v1.2.3


From 9f6200bbfc962d8f926278cf5d5ddb90a228c322 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 23 Sep 2008 09:18:24 -0400
Subject: ext4: move /proc setup and teardown out of mballoc.c

...and into the core setup/teardown code in fs/ext4/super.c so that
other parts of ext4 can define tuning parameters.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/ext4_sb.h |  2 +-
 fs/ext4/mballoc.c | 76 ++++++++++++++++---------------------------------------
 fs/ext4/mballoc.h |  1 -
 fs/ext4/super.c   | 17 +++++++++++++
 5 files changed, 42 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 50a4846c7e7..b9c9371d816 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -955,6 +955,8 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
 
+extern struct proc_dir_entry *ext4_proc_root;
+
 /*
  * Function prototypes
  */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index a5577e0ccd3..95e046e77cb 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -61,6 +61,7 @@ struct ext4_sb_info {
 	struct percpu_counter s_dirs_counter;
 	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
 
 	/* root of the per fs reservation window tree */
 	spinlock_t s_rsv_window_lock;
@@ -122,7 +123,6 @@ struct ext4_sb_info {
 	int s_mb_history_cur;
 	int s_mb_history_max;
 	int s_mb_history_num;
-	struct proc_dir_entry *s_mb_proc;
 	spinlock_t s_mb_history_lock;
 	int s_mb_history_filter;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 14ebd572bea..842d4017006 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2170,9 +2170,10 @@ static void ext4_mb_history_release(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	remove_proc_entry("mb_groups", sbi->s_mb_proc);
-	remove_proc_entry("mb_history", sbi->s_mb_proc);
-
+	if (sbi->s_proc != NULL) {
+		remove_proc_entry("mb_groups", sbi->s_proc);
+		remove_proc_entry("mb_history", sbi->s_proc);
+	}
 	kfree(sbi->s_mb_history);
 }
 
@@ -2181,10 +2182,10 @@ static void ext4_mb_history_init(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int i;
 
-	if (sbi->s_mb_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc,
+	if (sbi->s_proc != NULL) {
+		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_history_fops, sb);
-		proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc,
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}
 
@@ -2720,8 +2721,6 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 #define EXT4_MB_STREAM_REQ		"stream_req"
 #define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
 
-
-
 #define MB_PROC_FOPS(name)					\
 static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)	\
 {								\
@@ -2771,7 +2770,7 @@ MB_PROC_FOPS(group_prealloc);
 
 #define	MB_PROC_HANDLER(name, var)					\
 do {									\
-	proc = proc_create_data(name, mode, sbi->s_mb_proc,		\
+	proc = proc_create_data(name, mode, sbi->s_proc,		\
 				&ext4_mb_##var##_proc_fops, sbi);	\
 	if (proc == NULL) {						\
 		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
@@ -2784,20 +2783,9 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct proc_dir_entry *proc;
-	char devname[BDEVNAME_SIZE], *p;
 
-	if (proc_root_ext4 == NULL) {
-		sbi->s_mb_proc = NULL;
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
-	}
-	bdevname(sb->s_bdev, devname);
-	p = devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
-
-	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
-	if (!sbi->s_mb_proc)
-		goto err_create_dir;
 
 	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
 	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
@@ -2805,43 +2793,31 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
 	MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
 	MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
-
 	return 0;
 
 err_out:
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
-	sbi->s_mb_proc = NULL;
-err_create_dir:
-	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
-
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	return -ENOMEM;
 }
 
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	char devname[BDEVNAME_SIZE], *p;
 
-	if (sbi->s_mb_proc == NULL)
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
 
-	bdevname(sb->s_bdev, devname);
-	p = devname;
-	while ((p = strchr(p, '/')))
-		*p = '!';
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 
 	return 0;
 }
@@ -2863,11 +2839,6 @@ int __init init_ext4_mballoc(void)
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
-#ifdef CONFIG_PROC_FS
-	proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
-	if (proc_root_ext4 == NULL)
-		printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
-#endif
 	return 0;
 }
 
@@ -2876,9 +2847,6 @@ void exit_ext4_mballoc(void)
 	/* XXX: synchronize_rcu(); */
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("fs/ext4", NULL);
-#endif
 }
 
 
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a7..b3b4828f8b8 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
-static struct proc_dir_entry *proc_root_ext4;
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb5766e2bff..7feeec6f7c3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/proc_fs.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,6 +46,8 @@
 #include "namei.h"
 #include "group.h"
 
+struct proc_dir_entry *ext4_proc_root;
+
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -512,6 +515,8 @@ static void ext4_put_super(struct super_block *sb)
 		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
+	if (sbi->s_proc)
+		remove_proc_entry(sb->s_id, ext4_proc_root);
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -1916,6 +1921,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
+	char *cp;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
@@ -1936,6 +1942,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	unlock_kernel();
 
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -2221,6 +2231,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -2500,6 +2513,8 @@ failed_mount2:
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 failed_mount:
+	if (sbi->s_proc)
+		remove_proc_entry(sb->s_id, ext4_proc_root);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -3538,6 +3553,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
 		return err;
@@ -3567,6 +3583,7 @@ static void __exit exit_ext4_fs(void)
 	destroy_inodecache();
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
+	remove_proc_entry("fs/ext4", NULL);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
cgit v1.2.3


From a12a1ac7a474b3680b9cce9f64a4f78123aecf37 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 23 Sep 2008 11:48:35 -0400
Subject: cifs: move rename and delete-on-close logic into helper function

cifs: move rename and delete-on-close logic into helper function

When a file is still open on the server, we attempt to set the
DELETE_ON_CLOSE bit and rename it to a new filename. When the
last opener closes the file, the server should delete it.

This patch moves this mechanism into a helper function and has
the two places in cifs_unlink that do this procedure call it. It
also fixes the open flags to be correct.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 98 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8dbc7c90309..660aac81160 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -760,6 +760,59 @@ out:
 	return rc;
 }
 
+/*
+ * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+static int
+cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
+{
+	int oplock = 0;
+	int rc;
+	__u16 netfid;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	__u32 dosattr;
+	FILE_BASIC_INFO *info_buf;
+
+	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+			 DELETE|FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+			 &netfid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
+
+	/* set ATTR_HIDDEN and clear ATTR_READONLY */
+	cifsInode = CIFS_I(inode);
+	dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+	if (dosattr == 0)
+		dosattr |= ATTR_NORMAL;
+	dosattr |= ATTR_HIDDEN;
+
+	info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+	if (info_buf == NULL) {
+		rc = -ENOMEM;
+		goto out_close;
+	}
+	info_buf->Attributes = cpu_to_le32(dosattr);
+	rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
+	kfree(info_buf);
+	if (rc != 0)
+		goto out_close;
+
+	/* silly-rename the file */
+	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+out_close:
+	CIFSSMBClose(xid, tcon, netfid);
+out:
+	return rc;
+}
+
 int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
@@ -805,23 +858,9 @@ psx_del_no_retry:
 	} else if (rc == -ENOENT) {
 		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
-		int oplock = 0;
-		__u16 netfid;
-
-		rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, DELETE,
-				 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
-				 &netfid, &oplock, NULL, cifs_sb->local_nls,
-				 cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL,
-					      cifs_sb->local_nls,
-					      cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, tcon, netfid);
-			if (inode)
-				drop_nlink(inode);
-		}
+		rc = cifs_rename_pending_delete(full_path, inode, xid);
+		if (rc == 0)
+			drop_nlink(inode);
 	} else if (rc == -EACCES) {
 		/* try only if r/o attribute set in local lookup data? */
 		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
@@ -848,28 +887,9 @@ psx_del_no_retry:
 			if (inode)
 				drop_nlink(inode);
 		} else if (rc == -ETXTBSY) {
-			int oplock = 0;
-			__u16 netfid;
-
-			rc = CIFSSMBOpen(xid, tcon, full_path,
-					 FILE_OPEN, DELETE,
-					 CREATE_NOT_DIR |
-					 CREATE_DELETE_ON_CLOSE,
-					 &netfid, &oplock, NULL,
-					 cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (rc == 0) {
-				CIFSSMBRenameOpenFile(xid, tcon,
-					netfid, NULL,
-					cifs_sb->local_nls,
-					cifs_sb->mnt_cifs_flags &
-					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				CIFSSMBClose(xid, tcon, netfid);
-				if (inode)
-					drop_nlink(inode);
-			}
-		/* BB if rc = -ETXTBUSY goto the rename logic BB */
+			rc = cifs_rename_pending_delete(full_path, inode, xid);
+			if (rc == 0)
+				drop_nlink(inode);
 		}
 	}
 out_reval:
-- 
cgit v1.2.3


From 7c9c3760b3a5ae87ee4d661703b6d5de3999fe46 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 23 Sep 2008 17:23:09 +0000
Subject: [CIFS] add constants for string lengths of keynames in SPNEGO upcall
 string

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 117ef4bba68..fcee9298b62 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
 	.describe	= user_describe,
 };
 
-#define MAX_VER_STR_LEN   8 /* length of longest version string e.g.
-				strlen("ver=0xFF") */
-#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
-			       in future could have strlen(";sec=ntlmsspi") */
-#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+/* length of longest version string e.g.  strlen("ver=0xFF") */
+#define MAX_VER_STR_LEN		8
+
+/* length of longest security mechanism name, eg in future could have
+ * strlen(";sec=ntlmsspi") */
+#define MAX_MECH_STR_LEN	13
+
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+#define MAX_IPV6_ADDR_LEN	42
+
+/* strlen of "host=" */
+#define HOST_KEY_LEN		5
+
+/* strlen of ";ip4=" or ";ip6=" */
+#define IP_KEY_LEN		5
+
+/* strlen of ";uid=0x" */
+#define UID_KEY_LEN		7
+
+/* strlen of ";user=" */
+#define USER_KEY_LEN		6
+
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
 	   host=hostname sec=mechanism uid=0xFF user=username */
 	desc_len = MAX_VER_STR_LEN +
-		   6 /* len of "host=" */ + strlen(hostname) +
-		   5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN +
+		   HOST_KEY_LEN + strlen(hostname) +
+		   IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
 		   MAX_MECH_STR_LEN +
-		   7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) +
-		   6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1;
+		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
 
 	spnego_key = ERR_PTR(-ENOMEM);
 	description = kzalloc(desc_len, GFP_KERNEL);
-- 
cgit v1.2.3


From 6d22f09896c0d62c003ffa25fff25323e3ed608b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 23 Sep 2008 11:48:35 -0400
Subject: cifs: add function to set file disposition

cifs: add function to set file disposition

The proper way to set the delete on close bit on an already existing
file is to use SET_FILE_INFO with an infolevel of
SMB_FILE_DISPOSITION_INFO. Add a function to do that and have the
silly-rename code use it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/cifssmb.c   | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/inode.c     |  9 +++++++--
 3 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a729d083e6f..014f26c7864 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 			const FILE_BASIC_INFO *data, __u16 fid,
 			__u32 pid_of_opener);
+extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+			bool delete_file, __u16 fid, __u32 pid_of_opener);
 #if 0
 extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
 			char *fileName, __u16 dos_attributes,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 994de7c9047..7b365842bfa 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4876,6 +4876,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	return rc;
 }
 
+int
+CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+			  bool delete_file, __u16 fid, __u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	char *data_offset;
+	int rc = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+
+	count = 1;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count;
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	*data_offset = delete_file ? 1 : 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	if (rc)
+		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+
+	return rc;
+}
 
 int
 CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 660aac81160..954b670f168 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -778,8 +778,7 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	FILE_BASIC_INFO *info_buf;
 
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
-			 DELETE|FILE_WRITE_ATTRIBUTES,
-			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc != 0)
@@ -807,6 +806,12 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out_close;
+
+	/* set DELETE_ON_CLOSE */
+	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
+
 out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
-- 
cgit v1.2.3


From ee2fd967fb23c5eecabc8a660ec66fcd79acbd47 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 23 Sep 2008 18:23:33 +0000
Subject: [CIFS]  fix busy-file renames and refactor cifs_rename logic

Break out the code that does the actual renaming into a separate
function and have cifs_rename call that. That function will attempt a
path based rename first and then do a filehandle based one if it looks
like the source is busy.

The existing logic tried a path based rename first, but if we needed to
remove the destination then it only attempted a filehandle based rename
afterward. Not all servers support renaming by filehandle, so we need to
always attempt path rename first and fall back to filehandle rename if
it doesn't work.

This also fixes renames of open files on windows servers (at least when
the source and destination directories are the same).

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 186 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 104 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 954b670f168..82612be9477 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -806,8 +806,6 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc != 0)
-		goto out_close;
 
 	/* set DELETE_ON_CLOSE */
 	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
@@ -1180,117 +1178,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static int
+cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
+		struct dentry *to_dentry, const char *toPath)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	__u16 srcfid;
+	int oplock, rc;
+
+	/* try path-based rename first */
+	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
+			   cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/*
+	 * don't bother with rename by filehandle unless file is busy and
+	 * source Note that cross directory moves do not work with
+	 * rename by filehandle to various Windows servers.
+	 */
+	if (rc == 0 || rc != -ETXTBSY)
+		return rc;
+
+	/* open the file to be renamed -- we need DELETE perms */
+	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
+			 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc == 0) {
+		rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+				(const char *) to_dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		CIFSSMBClose(xid, pTcon, srcfid);
+	}
+
+	return rc;
+}
+
 int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 	struct inode *target_inode, struct dentry *target_direntry)
 {
-	char *fromName;
-	char *toName;
+	char *fromName = NULL;
+	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
 	struct cifsTconInfo *pTcon;
+	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid;
-	int rc = 0;
-
-	xid = GetXid();
+	int rc;
 
 	cifs_sb_target = CIFS_SB(target_inode->i_sb);
 	cifs_sb_source = CIFS_SB(source_inode->i_sb);
 	pTcon = cifs_sb_source->tcon;
 
+	xid = GetXid();
+
+	/*
+	 * BB: this might be allowed if same server, but different share.
+	 * Consider adding support for this
+	 */
 	if (pTcon != cifs_sb_target->tcon) {
-		FreeXid(xid);
-		return -EXDEV;	/* BB actually could be allowed if same server,
-				   but different share.
-				   Might eventually add support for this */
+		rc = -EXDEV;
+		goto cifs_rename_exit;
 	}
 
-	/* we already  have the rename sem so we do not need to grab it again
-	   here to protect the path integrity */
+	/*
+	 * we already have the rename sem so we do not need to
+	 * grab it again here to protect the path integrity
+	 */
 	fromName = build_path_from_dentry(source_direntry);
+	if (fromName == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
+	}
+
 	toName = build_path_from_dentry(target_direntry);
-	if ((fromName == NULL) || (toName == NULL)) {
+	if (toName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	rc = CIFSSMBRename(xid, pTcon, fromName, toName,
-			   cifs_sb_source->local_nls,
-			   cifs_sb_source->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	rc = cifs_do_rename(xid, source_direntry, fromName,
+			    target_direntry, toName);
+
 	if (rc == -EEXIST) {
-		/* check if they are the same file because rename of hardlinked
-		   files is a noop */
-		FILE_UNIX_BASIC_INFO *info_buf_source;
-		FILE_UNIX_BASIC_INFO *info_buf_target;
-
-		info_buf_source =
-			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-		if (info_buf_source != NULL) {
+		if (pTcon->unix_ext) {
+			/*
+			 * Are src and dst hardlinks of same inode? We can
+			 * only tell with unix extensions enabled
+			 */
+			info_buf_source =
+				kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+						GFP_KERNEL);
+			if (info_buf_source != NULL)
+				goto unlink_target;
+
 			info_buf_target = info_buf_source + 1;
-			if (pTcon->unix_ext)
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
-					info_buf_source,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
+						info_buf_source,
+						cifs_sb_source->local_nls,
+						cifs_sb_source->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			/* else rc is still EEXIST so will fall through to
-			   unlink the target and retry rename */
-			if (rc == 0) {
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName,
-						info_buf_target,
+			if (rc != 0)
+				goto unlink_target;
+
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon,
+						toName, info_buf_target,
 						cifs_sb_target->local_nls,
 						/* remap based on source sb */
 						cifs_sb_source->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			if ((rc == 0) &&
-			    (info_buf_source->UniqueId ==
-			     info_buf_target->UniqueId)) {
-			/* do not rename since the files are hardlinked which
-			   is a noop */
-			} else {
-			/* we either can not tell the files are hardlinked
-			   (as with Windows servers) or files are not
-			   hardlinked so delete the target manually before
-			   renaming to follow POSIX rather than Windows
-			   semantics */
-				cifs_unlink(target_inode, target_direntry);
-				rc = CIFSSMBRename(xid, pTcon, fromName,
-						   toName,
-						   cifs_sb_source->local_nls,
-						   cifs_sb_source->mnt_cifs_flags
-						   & CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			kfree(info_buf_source);
-		} /* if we can not get memory just leave rc as EEXIST */
-	}
-
-	if (rc)
-		cFYI(1, ("rename rc %d", rc));
-
-	if ((rc == -EIO) || (rc == -EEXIST)) {
-		int oplock = 0;
-		__u16 netfid;
-
-		/* BB FIXME Is Generic Read correct for rename? */
-		/* if renaming directory - we should not say CREATE_NOT_DIR,
-		   need to test renaming open directory, also GENERIC_READ
-		   might not right be right access to request */
-		rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
-				 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-				 cifs_sb_source->local_nls,
-				 cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
-					      cifs_sb_source->local_nls,
-					      cifs_sb_source->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-		}
+
+			if (rc == 0 && (info_buf_source->UniqueId ==
+					info_buf_target->UniqueId))
+				/* same file, POSIX says that this is a noop */
+				goto cifs_rename_exit;
+		} /* else ... BB we could add the same check for Windows by
+		     checking the UniqueId via FILE_INTERNAL_INFO */
+unlink_target:
+		/*
+		 * we either can not tell the files are hardlinked (as with
+		 * Windows servers) or files are not hardlinked. Delete the
+		 * target manually before renaming to follow POSIX rather than
+		 * Windows semantics
+		 */
+		cifs_unlink(target_inode, target_direntry);
+		rc = cifs_do_rename(xid, source_direntry, fromName,
+				    target_direntry, toName);
 	}
 
 cifs_rename_exit:
+	kfree(info_buf_source);
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
-- 
cgit v1.2.3


From 9d81523480c8c5b07a4899a084b3f4264a575184 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 23 Sep 2008 18:46:07 +0000
Subject: [CIFS] clean up upcall handling for dns_resolver keys

We're given the datalen in the downcall, so there's no need to do any
calls to strlen(). Just keep track of the datalen in the key. Finally,
add a sanity check of the data in the downcall to make sure that it
looks like a real IP address.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dns_resolve.c | 74 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 41 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index a2e0673e1b0..1e0c1bd8f2e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,19 +29,55 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static int dns_resolver_instantiate(struct key *key, const void *data,
+/* Checks if supplied name is IP address
+ * returns:
+ * 		1 - name is IP
+ * 		0 - name is not IP
+ */
+static int
+is_ip(const char *name)
+{
+	int rc;
+	struct sockaddr_in sin_server;
+	struct sockaddr_in6 sin_server6;
+
+	rc = cifs_inet_pton(AF_INET, name,
+			&sin_server.sin_addr.s_addr);
+
+	if (rc <= 0) {
+		/* not ipv4 address, try ipv6 */
+		rc = cifs_inet_pton(AF_INET6, name,
+				&sin_server6.sin6_addr.in6_u);
+		if (rc > 0)
+			return 1;
+	} else {
+		return 1;
+	}
+	/* we failed translating address */
+	return 0;
+}
+
+static int
+dns_resolver_instantiate(struct key *key, const void *data,
 		size_t datalen)
 {
 	int rc = 0;
 	char *ip;
 
-	ip = kmalloc(datalen+1, GFP_KERNEL);
+	ip = kmalloc(datalen + 1, GFP_KERNEL);
 	if (!ip)
 		return -ENOMEM;
 
 	memcpy(ip, data, datalen);
 	ip[datalen] = '\0';
 
+	/* make sure this looks like an address */
+	if (!is_ip((const char *) ip)) {
+		kfree(ip);
+		return -EINVAL;
+	}
+
+	key->type_data.x[0] = datalen;
 	rcu_assign_pointer(key->payload.data, ip);
 
 	return rc;
@@ -62,33 +98,6 @@ struct key_type key_type_dns_resolver = {
 	.match       = user_match,
 };
 
-/* Checks if supplied name is IP address
- * returns:
- * 		1 - name is IP
- * 		0 - name is not IP
- */
-static int is_ip(const char *name)
-{
-	int rc;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
-
-	rc = cifs_inet_pton(AF_INET, name,
-			&sin_server.sin_addr.s_addr);
-
-	if (rc <= 0) {
-		/* not ipv4 address, try ipv6 */
-		rc = cifs_inet_pton(AF_INET6, name,
-				&sin_server6.sin6_addr.in6_u);
-		if (rc > 0)
-			return 1;
-	} else {
-		return 1;
-	}
-	/* we failed translating address */
-	return 0;
-}
-
 /* Resolves server name to ip address.
  * input:
  * 	unc - server UNC
@@ -140,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 
 	rkey = request_key(&key_type_dns_resolver, name, "");
 	if (!IS_ERR(rkey)) {
+		len = rkey->type_data.x[0];
 		data = rkey->payload.data;
 	} else {
 		cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -148,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 
 skip_upcall:
 	if (data) {
-		len = strlen(data);
-		*ip_addr = kmalloc(len+1, GFP_KERNEL);
+		*ip_addr = kmalloc(len + 1, GFP_KERNEL);
 		if (*ip_addr) {
-			memcpy(*ip_addr, data, len);
-			(*ip_addr)[len] = '\0';
+			memcpy(*ip_addr, data, len + 1);
 			if (!IS_ERR(rkey))
 				cFYI(1, ("%s: resolved: %s to %s", __func__,
 							name,
-- 
cgit v1.2.3


From 5e8814f2f74f53d58aa5679bf32b38a7940033fe Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 23 Sep 2008 18:07:35 -0400
Subject: ext4: Combine proc file handling into a single set of functions

Previously mballoc created a separate set of functions for each proc
file.  This combines the tunables into a single set of functions which
gets used for all of the per-superblock proc files, saving
approximately 2k of compiled object code.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 16 +++++++++++++
 fs/ext4/ext4_sb.h | 12 +++++-----
 fs/ext4/mballoc.c | 69 +++++--------------------------------------------------
 fs/ext4/super.c   | 42 +++++++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b9c9371d816..163c44527dd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -957,6 +957,22 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
+#ifdef CONFIG_PROC_FS
+extern const struct file_operations ext4_ui_proc_fops;
+
+#define	EXT4_PROC_HANDLER(name, var)					\
+do {									\
+	proc = proc_create_data(name, mode, sbi->s_proc,		\
+				&ext4_ui_proc_fops, &sbi->s_##var);	\
+	if (proc == NULL) {						\
+		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
+		goto err_out;						\
+	}								\
+} while (0)
+#else
+#define EXT4_PROC_HANDLER(name, var)
+#endif
+
 /*
  * Function prototypes
  */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 95e046e77cb..f92af01138d 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -108,12 +108,12 @@ struct ext4_sb_info {
 
 	/* tunables */
 	unsigned long s_stripe;
-	unsigned long s_mb_stream_request;
-	unsigned long s_mb_max_to_scan;
-	unsigned long s_mb_min_to_scan;
-	unsigned long s_mb_stats;
-	unsigned long s_mb_order2_reqs;
-	unsigned long s_mb_group_prealloc;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 842d4017006..b9118bb2993 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2721,63 +2721,6 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 #define EXT4_MB_STREAM_REQ		"stream_req"
 #define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
 
-#define MB_PROC_FOPS(name)					\
-static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)	\
-{								\
-	struct ext4_sb_info *sbi = m->private;			\
-								\
-	seq_printf(m, "%ld\n", sbi->s_mb_##name);		\
-	return 0;						\
-}								\
-								\
-static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
-{								\
-	return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
-}								\
-								\
-static ssize_t ext4_mb_##name##_proc_write(struct file *file,	\
-		const char __user *buf, size_t cnt, loff_t *ppos)	\
-{								\
-	struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
-	char str[32];						\
-	long value;						\
-	if (cnt >= sizeof(str))					\
-		return -EINVAL;					\
-	if (copy_from_user(str, buf, cnt))			\
-		return -EFAULT;					\
-	value = simple_strtol(str, NULL, 0);			\
-	if (value <= 0)						\
-		return -ERANGE;					\
-	sbi->s_mb_##name = value;				\
-	return cnt;						\
-}								\
-								\
-static const struct file_operations ext4_mb_##name##_proc_fops = {	\
-	.owner		= THIS_MODULE,				\
-	.open		= ext4_mb_##name##_proc_open,		\
-	.read		= seq_read,				\
-	.llseek		= seq_lseek,				\
-	.release	= single_release,			\
-	.write		= ext4_mb_##name##_proc_write,		\
-};
-
-MB_PROC_FOPS(stats);
-MB_PROC_FOPS(max_to_scan);
-MB_PROC_FOPS(min_to_scan);
-MB_PROC_FOPS(order2_reqs);
-MB_PROC_FOPS(stream_request);
-MB_PROC_FOPS(group_prealloc);
-
-#define	MB_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_proc,		\
-				&ext4_mb_##var##_proc_fops, sbi);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
@@ -2787,12 +2730,12 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 	if (sbi->s_proc == NULL)
 		return -EINVAL;
 
-	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
-	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
-	MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
-	MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+	EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
+	EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
+	EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
+	EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
 	return 0;
 
 err_out:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7feeec6f7c3..9f5468fb06d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3541,6 +3541,48 @@ static int ext4_get_sb(struct file_system_type *fs_type,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
+#ifdef CONFIG_PROC_FS
+static int ext4_ui_proc_show(struct seq_file *m, void *v)
+{
+	unsigned int *p = m->private;
+
+	seq_printf(m, "%u\n", *p);
+	return 0;
+}
+
+static int ext4_ui_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
+}
+
+static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
+			       size_t cnt, loff_t *ppos)
+{
+	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+	char str[32];
+	unsigned long value;
+
+	if (cnt >= sizeof(str))
+		return -EINVAL;
+	if (copy_from_user(str, buf, cnt))
+		return -EFAULT;
+	value = simple_strtol(str, NULL, 0);
+	if (value < 0)
+		return -ERANGE;
+	*p = value;
+	return cnt;
+}
+
+const struct file_operations ext4_ui_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_ui_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= ext4_ui_proc_write,
+};
+#endif
+
 static struct file_system_type ext4dev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4dev",
-- 
cgit v1.2.3


From 74553b1b6a8556e08757b4bce537fd8332b93898 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 24 Sep 2008 14:55:51 -0400
Subject: cifs: fix inverted NULL check after kmalloc

cifs: fix inverted NULL check after kmalloc

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82612be9477..079f39a8dd3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1274,7 +1274,7 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 			info_buf_source =
 				kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
 						GFP_KERNEL);
-			if (info_buf_source != NULL)
+			if (info_buf_source == NULL)
 				goto unlink_target;
 
 			info_buf_target = info_buf_source + 1;
-- 
cgit v1.2.3


From 7ce86d5a93ffe2542e6558a97ab055377df8cde3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 24 Sep 2008 11:32:59 -0400
Subject: cifs: work around samba returning -ENOENT on SetFileDisposition call

cifs: work around samba returning -ENOENT on SetFileDisposition call

Samba seems to return STATUS_OBJECT_NAME_NOT_FOUND when we try to set
the delete on close bit after doing a rename by filehandle. This looks
like a samba bug to me, but a lot of servers will do this. For now,
pretend an -ENOENT return is a success.

Samba does however seem to respect the CREATE_DELETE_ON_CLOSE bit
when opening files that already exist. Windows will ignore it, but
so adding it to the open flags should be harmless.

We're also currently ignoring the return code on the rename by
filehandle, so no need to set rc based on it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 079f39a8dd3..27e97d43c75 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -778,7 +778,8 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	FILE_BASIC_INFO *info_buf;
 
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
-			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
+			 DELETE|FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc != 0)
@@ -803,13 +804,20 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 		goto out_close;
 
 	/* silly-rename the file */
-	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+	CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 	/* set DELETE_ON_CLOSE */
 	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
 
+	/*
+	 * some samba versions return -ENOENT when we try to set the file
+	 * disposition here. Likely a samba bug, but work around it for now
+	 */
+	if (rc == -ENOENT)
+		rc = 0;
+
 out_close:
 	CIFSSMBClose(xid, tcon, netfid);
 out:
-- 
cgit v1.2.3


From 391e575556109744ae0aa198c1e245588a3ea76a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 24 Sep 2008 11:32:59 -0400
Subject: cifs: remove NULL termination from rename target in
 CIFSSMBRenameOpenFIle

cifs: remove NULL termination from rename target in CIFSSMBRenameOpenFIle

The rename destination isn't supposed to be null terminated. Also,
change the name string arg to be const.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 2 +-
 fs/cifs/cifssmb.c   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 014f26c7864..0cff7fe986e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -231,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
-			int netfid, char *target_name,
+			int netfid, const char *target_name,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7b365842bfa..7504d1514c7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2017,7 +2017,7 @@ renameRetry:
 }
 
 int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
-		int netfid, char *target_name,
+		int netfid, const char *target_name,
 		const struct nls_table *nls_codepage, int remap)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
@@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 					remap);
 	}
 	rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
-	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2;
+	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
 	byte_count += count;
 	pSMB->DataCount = cpu_to_le16(count);
 	pSMB->TotalDataCount = pSMB->DataCount;
-- 
cgit v1.2.3


From d388908ec40ada0001dfe05134de31d0cc62907c Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 24 Sep 2008 19:22:52 +0000
Subject: [CIFS] update DOS attributes in cifsInode if we successfully changed
 them

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 27e97d43c75..db091c516c2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -752,6 +752,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 
 set_via_filehandle:
 	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+	if (!rc)
+		cifsInode->cifsAttrs = dosattr;
+
 	if (open_file == NULL)
 		CIFSSMBClose(xid, pTcon, netfid);
 	else
@@ -902,6 +905,7 @@ psx_del_no_retry:
 			if (rc == 0)
 				drop_nlink(inode);
 		}
+		cifsInode->cifsAttrs = dosattr;
 	}
 out_reval:
 	if (inode) {
-- 
cgit v1.2.3


From d9414774dc0c7b395036deeca000af42e2d13612 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 24 Sep 2008 11:32:59 -0400
Subject: cifs: Convert cifs to new aops.

cifs: Convert cifs to new aops.

This patch is based on the one originally posted by Nick Piggin. His
patch was very close, but had a couple of small bugs. Nick's original
comments follow:

This is another relatively naive conversion. Always do the read upfront
when the page is not uptodate (unless we're in the writethrough path).

Fix an uninitialized data exposure where SetPageUptodate was called
before the page was uptodate.

SetPageUptodate and switch to writeback mode in the case that the full
page was dirtied.

Acked-by: Shaggy <shaggy@austin.ibm.com>
Acked-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 120 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 59 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d39e852a28a..c4a8a060512 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 
 	/* want handles we can use to read with first
 	   in the list so we do not have to walk the
-	   list to search for one in prepare_write */
+	   list to search for one in write_begin */
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
 		list_add_tail(&pCifsFile->flist,
 			      &pCifsInode->openFileList);
@@ -915,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 }
 
 static ssize_t cifs_write(struct file *file, const char *write_data,
-	size_t write_size, loff_t *poffset)
+			  size_t write_size, loff_t *poffset)
 {
 	int rc = 0;
 	unsigned int bytes_written = 0;
@@ -1455,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 	return rc;
 }
 
-static int cifs_commit_write(struct file *file, struct page *page,
-	unsigned offset, unsigned to)
+static int cifs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
 {
-	int xid;
-	int rc = 0;
-	struct inode *inode = page->mapping->host;
-	loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-	char *page_data;
+	int rc;
+	struct inode *inode = mapping->host;
 
-	xid = GetXid();
-	cFYI(1, ("commit write for page %p up to position %lld for %d",
-		 page, position, to));
-	spin_lock(&inode->i_lock);
-	if (position > inode->i_size)
-		i_size_write(inode, position);
+	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied));
+
+	if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+		SetPageUptodate(page);
 
-	spin_unlock(&inode->i_lock);
 	if (!PageUptodate(page)) {
-		position =  ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
-		/* can not rely on (or let) writepage write this data */
-		if (to < offset) {
-			cFYI(1, ("Illegal offsets, can not copy from %d to %d",
-				offset, to));
-			FreeXid(xid);
-			return rc;
-		}
+		char *page_data;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		int xid;
+
+		xid = GetXid();
 		/* this is probably better than directly calling
 		   partialpage_write since in this function the file handle is
 		   known which we might as well	leverage */
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, to-offset,
-				&position);
-		if (rc > 0)
-			rc = 0;
-		/* else if (rc < 0) should we set writebehind rc? */
+		rc = cifs_write(file, page_data + offset, copied, &pos);
+		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
+
+		FreeXid(xid);
 	} else {
+		rc = copied;
+		pos += copied;
 		set_page_dirty(page);
 	}
 
-	FreeXid(xid);
+	if (rc > 0) {
+		spin_lock(&inode->i_lock);
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
 	return rc;
 }
 
@@ -2043,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
 		return true;
 }
 
-static int cifs_prepare_write(struct file *file, struct page *page,
-	unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
 {
-	int rc = 0;
-	loff_t i_size;
-	loff_t offset;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
+
+	*pagep = __grab_cache_page(mapping, index);
+	if (!*pagep)
+		return -ENOMEM;
 
-	cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
-	if (PageUptodate(page))
+	if (PageUptodate(*pagep))
 		return 0;
 
 	/* If we are writing a full page it will be up to date,
 	   no need to read from the server */
-	if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
-		SetPageUptodate(page);
+	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
 		return 0;
-	}
 
-	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	i_size = i_size_read(page->mapping->host);
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		int rc;
 
-	if ((offset >= i_size) ||
-	    ((from == 0) && (offset + to) >= i_size)) {
-		/*
-		 * We don't need to read data beyond the end of the file.
-		 * zero it, and set the page uptodate
-		 */
-		simple_prepare_write(file, page, from, to);
-		SetPageUptodate(page);
-	} else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
 		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, page, &offset);
+		rc = cifs_readpage_worker(file, *pagep, &offset);
+
+		/* we do not need to pass errors back
+		   e.g. if we do not have read access to the file
+		   because cifs_write_end will attempt synchronous writes
+		   -- shaggy */
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
-		   this will be written out by commit_write so is fine */
+		   this will be written out by write_end so is fine */
 	}
 
-	/* we do not need to pass errors back
-	   e.g. if we do not have read access to the file
-	   because cifs_commit_write will do the right thing.  -- shaggy */
-
 	return 0;
 }
 
@@ -2094,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
@@ -2110,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
-- 
cgit v1.2.3


From dfd15c46a6c2cafb006183c0c14f07e59eee4ac0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 24 Sep 2008 11:32:59 -0400
Subject: cifs: explicitly revoke SPNEGO key after session setup

cifs: explicitly revoke SPNEGO key after session setup

The SPNEGO blob returned by an upcall can only be used once. Explicitly
revoke it to make sure that we never pick it up again after session
setup exits.

This doesn't seem to be that big an issue on more recent kernels, but
older kernels seem to link keys into the session keyring by default.
That said, explicitly revoking the key seems like a reasonable thing
to do here.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 252fdc0567f..2851d5da0c8 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -624,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 					 ses, nls_cp);
 
 ssetup_exit:
-	if (spnego_key)
+	if (spnego_key) {
+		key_revoke(spnego_key);
 		key_put(spnego_key);
+	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
-- 
cgit v1.2.3


From 62aa528e0299ffef8e138d9d92d13e631d06c5ff Mon Sep 17 00:00:00 2001
From: Julien Brunel <brunel@diku.dk>
Date: Wed, 24 Sep 2008 16:22:22 -0500
Subject: 9p: use an IS_ERR test rather than a NULL test

In case of error, the function p9_client_walk returns an ERR pointer, but
never returns a NULL pointer.  So a NULL test that comes after an IS_ERR
test should be deleted.

The semantic match that finds this problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@match_bad_null_test@
expression x, E;
statement S1,S2;
@@
x = p9_client_walk(...)
... when != x = E
*  if (x != NULL)
S1 else S2
// </smpl>

Signed-off-by: Julien Brunel <brunel@diku.dk>
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/9p/vfs_inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c6504..e83aa5ebe86 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	return NULL;
 
 error:
-	if (fid)
-		p9_client_clunk(fid);
+	p9_client_clunk(fid);
 
 	return ERR_PTR(result);
 }
-- 
cgit v1.2.3


From f1ccd2955157e1aff992f6aaaba0944209076220 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 26 Sep 2008 12:16:46 +1000
Subject: [XFS] Fix extent list corruption in xfs_iext_irec_compact_full().

If we don't move all the records from the next buffer into the current
buffer then we need to update the er_extoff field of the next buffer as we
shift the remaining records to the start of the buffer.

SGI-PV: 987159

SGI-Modid: xfs-linux-melb:xfs-kern:32165a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Russell Cattelan <cattelan@thebarn.com>
---
 fs/xfs/xfs_inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 00e80df9dd9..419cfc2eacb 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4584,6 +4584,7 @@ xfs_iext_irec_compact_full(
 					(XFS_LINEAR_EXTS -
 						erp_next->er_extcount) *
 					sizeof(xfs_bmbt_rec_t));
+				erp_next->er_extoff += ext_diff;
 			}
 		}
 
-- 
cgit v1.2.3


From 71a8c87fb300b601eacf7a86cc6c6322fe827bfd Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 26 Sep 2008 12:17:57 +1000
Subject: [XFS] Remove xfs_iext_irec_compact_full()

Yet another bug was found in xfs_iext_irec_compact_full() and while the
source of the bug was found it wasn't an easy task to track it down
because the conditions are very difficult to reproduce.

A HUGE thank-you goes to Russell Cattelan and Eric Sandeen for their
significant effort in tracking down the source of this corruption.

xfs_iext_irec_compact_full() and xfs_iext_irec_compact_pages() are almost
identical - they both compact indirect extent lists by moving extents from
subsequent buffers into earlier ones. xfs_iext_irec_compact_pages() only
moves extents if all of the extents in the next buffer will fit into the
empty space in the buffer before it. xfs_iext_irec_compact_full() will go
a step further and move part of the next buffer if all the extents wont
fit. It will then shift the remaining extents in the next buffer up to the
start of the buffer. The bug here was that we did not update er_extoff and
this caused extent list corruption.

It does not appear that this extra functionality gains us much. Calling
xfs_iext_irec_compact_pages() instead will do a good enough job at
compacting the indirect list and will be quicker too.

For the case in xfs_iext_indirect_to_direct() the total number of extents
in the indirect list will fit into one buffer so we will never need the
extra functionality of xfs_iext_irec_compact_full() there.

Also xfs_iext_irec_compact_pages() doesn't need to do a memmove() (the
buffers will never overlap) so we don't want the performance hit that can
incur.

SGI-PV: 987159

SGI-Modid: xfs-linux-melb:xfs-kern:32166a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/xfs/xfs_inode.c | 95 ++----------------------------------------------------
 1 file changed, 3 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 419cfc2eacb..dbd9cef852e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct(
 	ASSERT(nextents <= XFS_LINEAR_EXTS);
 	size = nextents * sizeof(xfs_bmbt_rec_t);
 
-	xfs_iext_irec_compact_full(ifp);
+	xfs_iext_irec_compact_pages(ifp);
 	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
 
 	ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4449,8 +4449,7 @@ xfs_iext_irec_remove(
  * compaction policy is as follows:
  *
  *    Full Compaction: Extents fit into a single page (or inline buffer)
- *    Full Compaction: Extents occupy less than 10% of allocated space
- * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
+ * Partial Compaction: Extents occupy less than 50% of allocated space
  *      No Compaction: Extents occupy at least 50% of allocated space
  */
 void
@@ -4471,8 +4470,6 @@ xfs_iext_irec_compact(
 		xfs_iext_direct_to_inline(ifp, nextents);
 	} else if (nextents <= XFS_LINEAR_EXTS) {
 		xfs_iext_indirect_to_direct(ifp);
-	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
-		xfs_iext_irec_compact_full(ifp);
 	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
 		xfs_iext_irec_compact_pages(ifp);
 	}
@@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages(
 		erp_next = erp + 1;
 		if (erp_next->er_extcount <=
 		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
-			memmove(&erp->er_extbuf[erp->er_extcount],
+			memcpy(&erp->er_extbuf[erp->er_extcount],
 				erp_next->er_extbuf, erp_next->er_extcount *
 				sizeof(xfs_bmbt_rec_t));
 			erp->er_extcount += erp_next->er_extcount;
@@ -4515,92 +4512,6 @@ xfs_iext_irec_compact_pages(
 	}
 }
 
-/*
- * Fully compact the extent records managed by the indirection array.
- */
-void
-xfs_iext_irec_compact_full(
-	xfs_ifork_t	*ifp)			/* inode fork pointer */
-{
-	xfs_bmbt_rec_host_t *ep, *ep_next;	/* extent record pointers */
-	xfs_ext_irec_t	*erp, *erp_next;	/* extent irec pointers */
-	int		erp_idx = 0;		/* extent irec index */
-	int		ext_avail;		/* empty entries in ex list */
-	int		ext_diff;		/* number of exts to add */
-	int		nlists;			/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	erp = ifp->if_u1.if_ext_irec;
-	ep = &erp->er_extbuf[erp->er_extcount];
-	erp_next = erp + 1;
-	ep_next = erp_next->er_extbuf;
-
-	while (erp_idx < nlists - 1) {
-		/*
-		 * Check how many extent records are available in this irec.
-		 * If there is none skip the whole exercise.
-		 */
-		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-		if (ext_avail) {
-
-			/*
-			 * Copy over as many as possible extent records into
-			 * the previous page.
-			 */
-			ext_diff = MIN(ext_avail, erp_next->er_extcount);
-			memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
-			erp->er_extcount += ext_diff;
-			erp_next->er_extcount -= ext_diff;
-
-			/*
-			 * If the next irec is empty now we can simply
-			 * remove it.
-			 */
-			if (erp_next->er_extcount == 0) {
-				/*
-				 * Free page before removing extent record
-				 * so er_extoffs don't get modified in
-				 * xfs_iext_irec_remove.
-				 */
-				kmem_free(erp_next->er_extbuf);
-				erp_next->er_extbuf = NULL;
-				xfs_iext_irec_remove(ifp, erp_idx + 1);
-				erp = &ifp->if_u1.if_ext_irec[erp_idx];
-				nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-			/*
-			 * If the next irec is not empty move up the content
-			 * that has not been copied to the previous page to
-			 * the beggining of this one.
-			 */
-			} else {
-				memmove(erp_next->er_extbuf, &ep_next[ext_diff],
-					erp_next->er_extcount *
-					sizeof(xfs_bmbt_rec_t));
-				ep_next = erp_next->er_extbuf;
-				memset(&ep_next[erp_next->er_extcount], 0,
-					(XFS_LINEAR_EXTS -
-						erp_next->er_extcount) *
-					sizeof(xfs_bmbt_rec_t));
-				erp_next->er_extoff += ext_diff;
-			}
-		}
-
-		if (erp->er_extcount == XFS_LINEAR_EXTS) {
-			erp_idx++;
-			if (erp_idx < nlists)
-				erp = &ifp->if_u1.if_ext_irec[erp_idx];
-			else
-				break;
-		}
-		ep = &erp->er_extbuf[erp->er_extcount];
-		erp_next = erp + 1;
-		ep_next = erp_next->er_extbuf;
-	}
-}
-
 /*
  * This is called to update the er_extoff field in the indirection
  * array when extents have been added or removed from one of the
-- 
cgit v1.2.3


From 254db57f9b12daba841a4d91ddb9a8161e9c74ba Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 26 Sep 2008 10:23:22 +0100
Subject: GFS2: Support for I/O barriers

This patch adds barrier support to GFS2. There is not a lot of change
really... we just add the barrier flag when we write journal header
blocks. If the underlying device refuses to support them, we fall back
to the previous way of doing things (wait for the I/O and hope) since
there is nothing else we can do. There is no user configuration,
barriers will always be on unless the device refuses to support them.
This seems a reasonable solution to me since this is a correctness
issue.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  1 +
 fs/gfs2/log.c    | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f1ed3a1bf8a..f566ec1b4e8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -432,6 +432,7 @@ enum {
 	SDF_JOURNAL_CHECKED	= 0,
 	SDF_JOURNAL_LIVE	= 1,
 	SDF_SHUTDOWN		= 2,
+	SDF_NOBARRIERS		= 3,
 };
 
 #define GFS2_FSNAME_LEN		256
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3a..ad305854bdc 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/bio.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_uptodate(bh);
 	clear_buffer_dirty(bh);
-	unlock_buffer(bh);
 
 	gfs2_ail1_empty(sdp, 0);
 	tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
 	lh->lh_hash = cpu_to_be32(hash);
 
-	set_buffer_dirty(bh);
-	if (sync_dirty_buffer(bh))
+	bh->b_end_io = end_buffer_write_sync;
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		goto skip_barrier;
+	get_bh(bh);
+	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	wait_on_buffer(bh);
+	if (buffer_eopnotsupp(bh)) {
+		clear_buffer_eopnotsupp(bh);
+		set_buffer_uptodate(bh);
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+		lock_buffer(bh);
+skip_barrier:
+		get_bh(bh);
+		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
+		wait_on_buffer(bh);
+	}
+	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
 
-- 
cgit v1.2.3


From d0185c0882d76b8126d4a099c7ac82b3b216d103 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 29 Sep 2008 07:42:57 -0700
Subject: Fix NULL pointer dereference in proc_sys_compare

The VFS interface for the 'd_compare()' is a bit special (read: 'odd'),
because it really just essentially replaces a memcmp().  The filesystem
is supposed to just compare the two names with whatever case-independent
or other function.

And when I say 'is supposed to', I obviously mean that 'procfs does odd
things, and actually looks at the dentry that we don't even pass down,
rather than just the name'.  Which results in problems, because we
actually call d_compare before we have even verified that the dentry is
still hashed at all.

And that causes a problm since the inode that procfs looks at may have
been free'd and the d_inode pointer is NULL.  procfs just assumes that
all dentries are positive, since procfs itself never generates a
negative one.  But memory pressure will still result in the dentry
getting torn down, and as it is removed by RCU, it still remains visible
on some lists - and to d_compare.

If the filesystem just did a name comparison, we wouldn't care.  And we
could just fix procfs to know about negative dentries too.  But rather
than have the low-level filesystems know about internal VFS details,
just move the check for a unhashed dentry up a bit, so that we will only
call d_compare on dentries that are still active.

The actual oops this caused didn't look like a NULL pointer dereference
because procfs did a 'container_of(inode, struct proc_inode, vfs_inode)'
to get at its internal proc_inode information from the inode pointer,
and accessed a field below the inode. So the oops would look something
like

	BUG: unable to handle kernel paging request at fffffffffffffff0
	IP: [<ffffffff802bc6c6>] proc_sys_compare+0x36/0x50

and was seen on both x86-64 (Alexey Dobriyan and Hugh Dickins) and
ppc64 (Hugh Dickins).

Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-of-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 80e93956ace..e7a1a99b746 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		if (dentry->d_parent != parent)
 			goto next;
 
+		/* non-existing due to RCU? */
+		if (d_unhashed(dentry))
+			goto next;
+
 		/*
 		 * It is safe to compare names since d_move() cannot
 		 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 				goto next;
 		}
 
-		if (!d_unhashed(dentry)) {
-			atomic_inc(&dentry->d_count);
-			found = dentry;
-		}
+		atomic_inc(&dentry->d_count);
+		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
 next:
-- 
cgit v1.2.3


From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Sun, 28 Sep 2008 23:09:31 +0100
Subject: mm owner: fix race between swapoff and exit

There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on.  The condition occurs when
try_to_unuse() runs in parallel with an exiting task.  A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.

CPU0                                    CPU1
                                        try_to_unuse
                                        looks at mm = task0->mm
                                        increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
                                        mmput(mm) decrements mm->mm_users
task0 freed
                                        dereferencing mm->owner fails

The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.

Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.

Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.

Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same.  And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.

Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe..cecee501ce7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
-	mm_update_next_owner(old_mm);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
+		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 		return 0;
 	}
-- 
cgit v1.2.3


From 16dbc6c9616363fe53811abcbd935336dc0a0f01 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Thu, 2 Oct 2008 14:50:12 -0700
Subject: inotify: fix lock ordering wrt do_page_fault's mmap_sem

Fix inotify lock order reversal with mmap_sem due to holding locks over
copy_to_user.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reported-by: "Daniel J Blueman" <daniel.blueman@gmail.com>
Tested-by: "Daniel J Blueman" <daniel.blueman@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inotify_user.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a25..d85c7d931cd 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
 }
 
 /*
- * remove_kevent - cleans up and ultimately frees the given kevent
+ * remove_kevent - cleans up the given kevent
  *
  * Caller must hold dev->ev_mutex.
  */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
 
 	dev->event_count--;
 	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
 
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
 	kfree(kevent->name);
 	kmem_cache_free(event_cachep, kevent);
 }
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
 		struct inotify_kernel_event *kevent;
 		kevent = inotify_dev_get_event(dev);
 		remove_kevent(dev, kevent);
+		free_kevent(kevent);
 	}
 }
 
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	dev = file->private_data;
 
 	while (1) {
-		int events;
 
 		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
 
 		mutex_lock(&dev->ev_mutex);
-		events = !list_empty(&dev->events);
-		mutex_unlock(&dev->ev_mutex);
-		if (events) {
+		if (!list_empty(&dev->events)) {
 			ret = 0;
 			break;
 		}
+		mutex_unlock(&dev->ev_mutex);
 
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	mutex_lock(&dev->ev_mutex);
 	while (1) {
 		struct inotify_kernel_event *kevent;
 
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 			}
 			break;
 		}
+		remove_kevent(dev, kevent);
+
+		/*
+		 * Must perform the copy_to_user outside the mutex in order
+		 * to avoid a lock order reversal with mmap_sem.
+		 */
+		mutex_unlock(&dev->ev_mutex);
 
 		if (copy_to_user(buf, &kevent->event, event_size)) {
 			ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 			count -= kevent->event.len;
 		}
 
-		remove_kevent(dev, kevent);
+		free_kevent(kevent);
+
+		mutex_lock(&dev->ev_mutex);
 	}
 	mutex_unlock(&dev->ev_mutex);
 
-- 
cgit v1.2.3


From 4b19de6d1cb07c8bcb6778e771f9cfd5bcfdfd3e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 2 Oct 2008 14:50:16 -0700
Subject: mm: tiny-shmem nommu fix

The previous patch db203d53d474aa068984e409d807628f5841da1b ("mm:
tiny-shmem fix lock ordering: mmap_sem vs i_mutex") to fix the lock
ordering in tiny-shmem breaks shared anonymous and IPC memory on NOMMU
architectures because it was using the expanding truncate to signal ramfs
to allocate a physically contiguous RAM backing the inode (otherwise it is
unusable for "memory mapping" it to userspace).

However do_truncate is what caused the lock ordering error, due to it
taking i_mutex.  In this case, we can actually just call ramfs directly to
allocate memory for the mapping, rather than go via truncate.

Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/file-nommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff..5145cb9125a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
  * size 0 on the assumption that it's going to be used for an mmap of shared
  * memory
  */
-static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
 	struct pagevec lru_pvec;
 	unsigned long npages, xpages, loop, limit;
-- 
cgit v1.2.3


From 240799cdf22bd789ea6852653c3b879d35ad0a6c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 9 Oct 2008 23:53:47 -0400
Subject: ext4: Use readahead when reading an inode from the inode table

With modern hard drives, reading 64k takes roughly the same time as
reading a 4k block.  So request readahead for adjacent inode table
blocks to reduce the time it takes when iterating over directories
(especially when doing this in htree sort order) in a cold cache case.
With this patch, the time it takes to run "git status" on a kernel
tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
is reduced by 21%.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |   2 +
 fs/ext4/ext4_sb.h |   1 +
 fs/ext4/inode.c   | 134 ++++++++++++++++++++++++++----------------------------
 fs/ext4/super.c   |  27 ++++++++++-
 4 files changed, 92 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 163c44527dd..922d18720c9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESGID		0
 
+#define EXT4_DEF_INODE_READAHEAD_BLKS	32
+
 /*
  * Default mount options
  */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index f92af01138d..94e0757522a 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -52,6 +52,7 @@ struct ext4_sb_info {
 	int s_desc_per_block_bits;
 	int s_inode_size;
 	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22fcbb67cd8..ef4ca3d4abc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3833,41 +3833,6 @@ out_stop:
 	ext4_journal_stop(handle);
 }
 
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
-		unsigned long ino, struct ext4_iloc *iloc)
-{
-	ext4_group_t block_group;
-	unsigned long offset;
-	ext4_fsblk_t block;
-	struct ext4_group_desc *gdp;
-
-	if (!ext4_valid_inum(sb, ino)) {
-		/*
-		 * This error is already checked for in namei.c unless we are
-		 * looking at an NFS filehandle, in which case no error
-		 * report is needed
-		 */
-		return 0;
-	}
-
-	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
-	gdp = ext4_get_group_desc(sb, block_group, NULL);
-	if (!gdp)
-		return 0;
-
-	/*
-	 * Figure out the offset within the block group inode table
-	 */
-	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
-		EXT4_INODE_SIZE(sb);
-	block = ext4_inode_table(sb, gdp) +
-		(offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
-	iloc->block_group = block_group;
-	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
-	return block;
-}
-
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 {
-	ext4_fsblk_t block;
-	struct buffer_head *bh;
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
+
+	iloc->bh = 0;
+	if (!ext4_valid_inum(sb, inode->i_ino))
+		return -EIO;
 
-	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
-	if (!block)
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
 		return -EIO;
 
-	bh = sb_getblk(inode->i_sb, block);
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error (inode->i_sb, "ext4_get_inode_loc",
-				"unable to read inode block - "
-				"inode=%lu, block=%llu",
-				 inode->i_ino, block);
+		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+			   "inode block - inode=%lu, block=%llu",
+			   inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		 */
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
-			struct ext4_group_desc *desc;
-			int inodes_per_buffer;
-			int inode_offset, i;
-			ext4_group_t block_group;
-			int start;
-
-			block_group = (inode->i_ino - 1) /
-					EXT4_INODES_PER_GROUP(inode->i_sb);
-			inodes_per_buffer = bh->b_size /
-				EXT4_INODE_SIZE(inode->i_sb);
-			inode_offset = ((inode->i_ino - 1) %
-					EXT4_INODES_PER_GROUP(inode->i_sb));
-			start = inode_offset & ~(inodes_per_buffer - 1);
+			int i, start;
 
-			/* Is the inode bitmap in cache? */
-			desc = ext4_get_group_desc(inode->i_sb,
-						block_group, NULL);
-			if (!desc)
-				goto make_io;
+			start = inode_offset & ~(inodes_per_block - 1);
 
-			bitmap_bh = sb_getblk(inode->i_sb,
-				ext4_inode_bitmap(inode->i_sb, desc));
+			/* Is the inode bitmap in cache? */
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 			if (!bitmap_bh)
 				goto make_io;
 
@@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
 				brelse(bitmap_bh);
 				goto make_io;
 			}
-			for (i = start; i < start + inodes_per_buffer; i++) {
+			for (i = start; i < start + inodes_per_block; i++) {
 				if (i == inode_offset)
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 			}
 			brelse(bitmap_bh);
-			if (i == start + inodes_per_buffer) {
+			if (i == start + inodes_per_block) {
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
@@ -3968,6 +3933,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		}
 
 make_io:
+		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+
+			table = ext4_inode_table(sb, gdp);
+			/* Make sure s_inode_readahead_blks is a power of 2 */
+			while (EXT4_SB(sb)->s_inode_readahead_blks &
+			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
+				EXT4_SB(sb)->s_inode_readahead_blks = 
+				   (EXT4_SB(sb)->s_inode_readahead_blks &
+				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			if (table > b)
+				b = table;
+			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+				num -= le16_to_cpu(gdp->bg_itable_unused);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
 		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
@@ -3978,10 +3973,9 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, "ext4_get_inode_loc",
-					"unable to read inode block - "
-					"inode=%lu, block=%llu",
-					inode->i_ino, block);
+			ext4_error(sb, __func__,
+				   "unable to read inode block - inode=%lu, "
+				   "block=%llu", inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
 		}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9f5468fb06d..6583aee5177 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb)
 		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
-	if (sbi->s_proc)
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -913,6 +919,7 @@ enum {
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 
 static match_table_t tokens = {
@@ -973,6 +980,7 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 };
 
@@ -1381,6 +1389,13 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 
 	unlock_kernel();
@@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -2513,8 +2534,10 @@ failed_mount2:
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 failed_mount:
-	if (sbi->s_proc)
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
-- 
cgit v1.2.3


From c2ea3fde61f1df1dbf062345f23277dcd6f01dfe Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 10 Oct 2008 09:40:52 -0400
Subject: ext4: Remove old legacy block allocator

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 1355 +----------------------------------------------------
 fs/ext4/ext4.h    |    8 +-
 fs/ext4/ext4_i.h  |   35 --
 fs/ext4/ext4_sb.h |    1 -
 fs/ext4/extents.c |    9 +-
 fs/ext4/file.c    |    2 +-
 fs/ext4/ialloc.c  |    1 -
 fs/ext4/inode.c   |   33 +-
 fs/ext4/ioctl.c   |   44 --
 fs/ext4/mballoc.c |   22 +-
 fs/ext4/resize.c  |   18 +-
 fs/ext4/super.c   |   40 +-
 12 files changed, 40 insertions(+), 1528 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cca7fd53ad7..59566c082f1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	}
 	return used_blocks;
 }
+
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -345,303 +346,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 */
 	return bh;
 }
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end).	Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext4_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk(KERN_DEBUG "Block Allocation Reservation "
-	       "Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-		if (verbose)
-			printk(KERN_DEBUG "reservation window 0x%p "
-			       "start:  %llu, end:  %llu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk(KERN_DEBUG "Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk(KERN_DEBUG "Bad reservation %p "
-			       "(prev->end >= start)\n", rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk(KERN_DEBUG "Restarting reservation "
-				       "walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk(KERN_DEBUG "Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			ext4_group_t group, struct super_block *sb)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext4_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
-		    struct ext4_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext4_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ext4_reserve_window_node *this;
-
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext4_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext4_release_file(): last writer close the file
- *	ext4_clear_inode(): last iput(), when nobody link to this file.
- *	ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext4_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
-	ext4_mb_discard_inode_preallocations(inode);
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window))
-			rsv_window_remove(inode->i_sb, rsv);
-		spin_unlock(rsv_lock);
-	}
-}
 
 /**
  * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -650,6 +354,13 @@ void ext4_discard_reservation(struct inode *inode)
  * @block:			start physcial block to free
  * @count:			number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant.  There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code.  The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
  */
 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext4_fsblk_t block, unsigned long count,
@@ -861,747 +572,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 	sb = inode->i_sb;
 
-	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-		ext4_free_blocks_sb(handle, sb, block, count,
-						&dquot_freed_blocks);
-	else
-		ext4_mb_free_blocks(handle, inode, block, count,
-						metadata, &dquot_freed_blocks);
+	ext4_mb_free_blocks(handle, inode, block, count,
+			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
 }
 
-/**
- * ext4_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
-{
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
-
-	if (ext4_test_bit(nr, bh->b_data))
-		return 0;
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext4_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-					ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext4_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext4_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext4_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext4_test_allocatable(here, bh))
-			return here;
-		ext4_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
-
-	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext4_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block, jh->b_committed_data)) {
-		ext4_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal, unsigned long *count,
-			struct ext4_reserve_window *my_rsv)
-{
-	ext4_fsblk_t group_first_block;
-	ext4_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext4_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT4_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT4_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT4_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-					ext4_test_allocatable(grp_goal - 1,
-								bitmap_bh);
-					i++, grp_goal--)
-				;
-		}
-	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext4_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- *	find_next_reservable_window():
- *		find a reservable space within the given range.
- *		It does not allocate the reservation window for now:
- *		alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@size: the target new reservation window size
- *
- *	@group_first_block: the first block we consider to start
- *			the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext4_reserve_window_node *search_head,
-				struct ext4_reserve_window_node *my_rsv,
-				struct super_block *sb,
-				ext4_fsblk_t start_block,
-				ext4_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext4_reserve_window_node *rsv, *prev;
-	ext4_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
-		}
-	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
-	 */
-
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole avaliable window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext4_rsv_window_add(sb, my_rsv);
-
-	return 0;
-}
-
-/**
- *	alloc_new_reservation()--allocate a new reservation window
- *
- *		To make a new reservation, we search part of the filesystem
- *		reservation list (the list that inside the group). We try to
- *		allocate a new reservation window near the allocation goal,
- *		or the beginning of the group, if there is no goal.
- *
- *		We first find a reservable space after the goal, then from
- *		there, we check the bitmap for the first free block after
- *		it. If there is no free block until the end of group, then the
- *		whole group is full, we failed. Otherwise, check if the free
- *		block is inside the expected reservable space, if so, we
- *		succeed.
- *		If the first free block is outside the reservable space, then
- *		start from the first free block, we search for next available
- *		space, and go on.
- *
- *	on succeed, a new reservation will be found and inserted into the list
- *	It contains at least one free block, and it does not overlap with other
- *	reservation windows.
- *
- *	failed: we failed to find a reservation window in this group
- *
- *	@rsv: the reservation
- *
- *	@grp_goal: The goal (group-relative).  It is where the search for a
- *		free reservable space should start from.
- *		if we have a grp_goal(grp_goal >0 ), then start from there,
- *		no grp_goal(grp_goal = -1), we start from the first block
- *		of the group.
- *
- *	@sb: the super block
- *	@group: the group we are trying to allocate in
- *	@bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
-		ext4_grpblk_t grp_goal, struct super_block *sb,
-		ext4_group_t group, struct buffer_head *bitmap_bh)
-{
-	struct ext4_reserve_window_node *search_head;
-	ext4_fsblk_t group_first_block, group_end_block, start_block;
-	ext4_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT4_MAX_RESERVE_BLOCKS)
-				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size = size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
-		return 0;		/* success */
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext4_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node *my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-	ext4_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL) {
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-					(grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							*count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-				(my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;				/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext4_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						s64 nblocks)
 {
@@ -1702,313 +679,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-/**
- * ext4_old_new_blocks() -- core block bitmap based block allocation function
- *
- * @handle:		handle to this transaction
- * @inode:		file inode
- * @goal:		given target block(filesystem wide)
- * @count:		target number of blocks to allocate
- * @errp:		error code
- *
- * ext4_old_new_blocks uses a goal block to assist allocation and look up
- * the block bitmap directly to do block allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If
- * that fails, it will try to allocate block(s) from other block groups
- * without any specific goal block.
- *
- * This function is called when -o nomballoc mount option is enabled
- *
- */
-ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gdp_bh;
-	ext4_group_t group_no;
-	ext4_group_t goal_group;
-	ext4_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
-	ext4_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
-	ext4_fsblk_t ret_block;		/* filesyetem-wide allocated block */
-	ext4_group_t bgi;			/* blockgroup iteration index */
-	int fatal = 0, err;
-	int performed_allocation = 0;
-	ext4_grpblk_t free_blocks;	/* number of free blocks in a group */
-	struct super_block *sb;
-	struct ext4_group_desc *gdp;
-	struct ext4_super_block *es;
-	struct ext4_sb_info *sbi;
-	struct ext4_reserve_window_node *my_rsv = NULL;
-	struct ext4_block_alloc_info *block_i;
-	unsigned short windowsz = 0;
-	ext4_group_t ngroups;
-	unsigned long num = *count;
-
-	sb = inode->i_sb;
-	if (!sb) {
-		*errp = -ENODEV;
-		printk(KERN_ERR "ext4_new_block: nonexistent superblock");
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
-		/*
-		 * With delalloc we already reserved the blocks
-		 */
-		while (*count && ext4_claim_free_blocks(sbi, *count)) {
-			/* let others to free the space */
-			yield();
-			*count = *count >> 1;
-		}
-		if (!*count) {
-			*errp = -ENOSPC;
-			return 0;	/*return with ENOSPC error */
-		}
-		num = *count;
-	}
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
-		*errp = -EDQUOT;
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	es = EXT4_SB(sb)->s_es;
-	ext4_debug("goal=%llu.\n", goal);
-	/*
-	 * Allocate a block from reservation only when
-	 * filesystem is mounted with reservation(default,-o reservation), and
-	 * it's a regular file, and
-	 * the desired window size is greater than 0 (One could use ioctl
-	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
-	 * reservation on that particular file)
-	 */
-	block_i = EXT4_I(inode)->i_block_alloc_info;
-	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
-		my_rsv = &block_i->rsv_window_node;
-
-	/*
-	 * First, test whether the goal block is free.
-	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= ext4_blocks_count(es))
-		goal = le32_to_cpu(es->s_first_data_block);
-	ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
-	goal_group = group_no;
-retry_alloc:
-	gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-	if (!gdp)
-		goto io_error;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-
-	if (free_blocks > 0) {
-		/*
-		 * try to allocate with group target block
-		 * in the goal group. If we have low free_blocks
-		 * count turn off reservation
-		 */
-		if (my_rsv && (free_blocks < windowsz)
-			&& (rsv_is_empty(&my_rsv->rsv_window)))
-			my_rsv = NULL;
-
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-
-	ngroups = EXT4_SB(sb)->s_groups_count;
-	smp_rmb();
-
-	/*
-	 * Now search the rest of the groups.  We assume that
-	 * group_no and gdp correctly point to the last group visited.
-	 */
-	for (bgi = 0; bgi < ngroups; bgi++) {
-		group_no++;
-		if (group_no >= ngroups)
-			group_no = 0;
-		gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp)
-			goto io_error;
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		/*
-		 * skip this group if the number of
-		 * free blocks is less than half of the reservation
-		 * window size.
-		 */
-		if (my_rsv && (free_blocks <= (windowsz/2)))
-			continue;
-
-		brelse(bitmap_bh);
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		/*
-		 * try to allocate block(s) from this group, without a goal(-1).
-		 */
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-	/*
-	 * We may end up a bogus ealier ENOSPC error due to
-	 * filesystem is "full" of reservations, but
-	 * there maybe indeed free blocks avaliable on disk
-	 * In this case, we just forget about the reservations
-	 * just do block allocation as without reservations.
-	 */
-	if (my_rsv) {
-		my_rsv = NULL;
-		windowsz = 0;
-		group_no = goal_group;
-		goto retry_alloc;
-	}
-	/* No space left on the device */
-	*errp = -ENOSPC;
-	goto out;
-
-allocated:
-
-	ext4_debug("using block group %lu(%d)\n",
-			group_no, gdp->bg_free_blocks_count);
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	fatal = ext4_journal_get_write_access(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
-
-	if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ret_block, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group) ||
-	    in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group)) {
-		ext4_error(sb, "ext4_new_block",
-			    "Allocating block in system zone - "
-			    "blocks from %llu, length %lu",
-			     ret_block, num);
-		/*
-		 * claim_block marked the blocks we allocated
-		 * as in use. So we may want to selectively
-		 * mark some of the blocks as free
-		 */
-		goto retry_alloc;
-	}
-
-	performed_allocation = 1;
-
-#ifdef CONFIG_JBD2_DEBUG
-	{
-		struct buffer_head *debug_bh;
-
-		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, ret_block);
-		if (debug_bh) {
-			BUFFER_TRACE(debug_bh, "state when allocated");
-			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
-			brelse(debug_bh);
-		}
-	}
-	jbd_lock_bh_state(bitmap_bh);
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
-		int i;
-
-		for (i = 0; i < num; i++) {
-			if (ext4_test_bit(grp_alloc_blk+i,
-					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk(KERN_ERR "%s: block was unexpectedly "
-				       "set in b_committed_data\n", __func__);
-			}
-		}
-	}
-	ext4_debug("found bit %d\n", grp_alloc_blk);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	jbd_unlock_bh_state(bitmap_bh);
-#endif
-
-	if (ret_block + num - 1 >= ext4_blocks_count(es)) {
-		ext4_error(sb, "ext4_new_block",
-			    "block(%llu) >= blocks count(%llu) - "
-			    "block_group = %lu, es == %p ", ret_block,
-			ext4_blocks_count(es), group_no, es);
-		goto out;
-	}
-
-	/*
-	 * It is up to the caller to add the new buffer to a journal
-	 * list of some description.  We don't know in advance whether
-	 * the caller wants to use it as metadata or data.
-	 */
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-	/*
-	 * Now reduce the dirty block count also. Should not go negative
-	 */
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter, *count);
-	else
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks -= num;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
-	}
-
-	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-	err = ext4_journal_dirty_metadata(handle, gdp_bh);
-	if (!fatal)
-		fatal = err;
-
-	sb->s_dirt = 1;
-	if (fatal)
-		goto out;
-
-	*errp = 0;
-	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
-	*count = num;
-	return ret_block;
-
-io_error:
-	*errp = -EIO;
-out:
-	if (fatal) {
-		*errp = fatal;
-		ext4_std_error(sb, fatal);
-	}
-	/*
-	 * Undo the block allocation
-	 */
-	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
-	brelse(bitmap_bh);
-	return 0;
-}
-
 #define EXT4_META_BLOCK 0x1
 
 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -2018,10 +688,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		return ext4_old_new_blocks(handle, inode, goal, count, errp);
-	}
-
 	memset(&ar, 0, sizeof(ar));
 	/* Fill with neighbour allocated blocks */
 
@@ -2242,3 +908,4 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 	return ext4_bg_num_gdb_meta(sb,group);
 
 }
+
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 922d18720c9..c50c04cc6d7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -539,7 +539,6 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
-#define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -1002,8 +1001,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 					 s64 nblocks);
@@ -1018,8 +1015,6 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext4_init_block_alloc_info(struct inode *);
-extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1054,7 +1049,7 @@ extern int ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_mb_discard_inode_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1084,7 +1079,6 @@ extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
 extern void ext4_delete_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_discard_reservation(struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e..2875eeca172 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned long ext4_group_t;
 
-struct ext4_reserve_window {
-	ext4_fsblk_t	_rsv_start;	/* First byte reserved */
-	ext4_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
-};
-
-struct ext4_reserve_window_node {
-	struct rb_node		rsv_node;
-	__u32			rsv_goal_size;
-	__u32			rsv_alloc_hit;
-	struct ext4_reserve_window	rsv_window;
-};
-
-struct ext4_block_alloc_info {
-	/* information about reservation window */
-	struct ext4_reserve_window_node rsv_window_node;
-	/*
-	 * was i_next_alloc_block in ext4_inode_info
-	 * is the logical (file-relative) number of the
-	 * most-recently-allocated block in this file.
-	 * We use this for detecting linearly ascending allocation requests.
-	 */
-	ext4_lblk_t last_alloc_logical_block;
-	/*
-	 * Was i_next_alloc_goal in ext4_inode_info
-	 * is the *physical* companion to i_next_alloc_block.
-	 * it the physical block number of the block which was most-recentl
-	 * allocated to this file.  This give us the goal (target) for the next
-	 * allocation when we detect linearly ascending requests.
-	 */
-	ext4_fsblk_t last_alloc_physical_block;
-};
-
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
@@ -97,9 +65,6 @@ struct ext4_inode_info {
 	ext4_group_t	i_block_group;
 	__u32	i_state;		/* Dynamic state flags for ext4 */
 
-	/* block reservation info */
-	struct ext4_block_alloc_info *i_block_alloc_info;
-
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4DEV_FS_XATTR
 	/*
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 94e0757522a..6a0b40d4326 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -67,7 +67,6 @@ struct ext4_sb_info {
 	/* root of the per fs reservation window tree */
 	spinlock_t s_rsv_window_lock;
 	struct rb_root s_rsv_window_root;
-	struct ext4_reserve_window_node s_rsv_window_head;
 
 	/* Journaling */
 	struct inode *s_journal_inode;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e8758df2617..c8f81f2fb28 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2697,11 +2697,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		goto out2;
 	}
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary.
+	 * Okay, we need to do block allocation.
 	 */
-	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
 
 	/* find neighbour allocated blocks */
 	ar.lleft = iblock;
@@ -2761,7 +2758,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
-		ext4_mb_discard_inode_preallocations(inode);
+		ext4_discard_preallocations(inode);
 		ext4_free_blocks(handle, inode, ext_pblock(&newex),
 					ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
@@ -2825,7 +2822,7 @@ void ext4_ext_truncate(struct inode *inode)
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 
-	ext4_discard_reservation(inode);
+	ext4_discard_preallocations(inode);
 
 	/*
 	 * TODO: optimization is possible here.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 11b289f42b7..62796b7e1d1 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -38,7 +38,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 			(atomic_read(&inode->i_writecount) == 1))
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_reservation(inode);
+		ext4_discard_preallocations(inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5e66a2feef0..1343bf18825 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -817,7 +817,6 @@ got:
 		ei->i_flags &= ~EXT4_DIRSYNC_FL;
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
-	ei->i_block_alloc_info = NULL;
 	ei->i_block_group = group;
 
 	ext4_set_inode_flags(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ef4ca3d4abc..bd770c360c1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -486,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 		Indirect *partial)
 {
-	struct ext4_block_alloc_info *block_i;
-
-	block_i =  EXT4_I(inode)->i_block_alloc_info;
-
 	/*
-	 * try the heuristic for sequential allocation,
-	 * failing that at least try to get decent locality.
+	 * XXX need to get goal block from mballoc's data structures
 	 */
-	if (block_i && (block == block_i->last_alloc_logical_block + 1)
-		&& (block_i->last_alloc_physical_block != 0)) {
-		return block_i->last_alloc_physical_block + 1;
-	}
 
 	return ext4_find_near(inode, partial);
 }
@@ -757,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 {
 	int i;
 	int err = 0;
-	struct ext4_block_alloc_info *block_i;
 	ext4_fsblk_t current_block;
 
-	block_i = EXT4_I(inode)->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
@@ -786,17 +775,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 
-	/*
-	 * update the most recently allocated logical & physical block
-	 * in i_block_alloc_info, to assist find the proper goal block for next
-	 * allocation
-	 */
-	if (block_i) {
-		block_i->last_alloc_logical_block = block + blks - 1;
-		block_i->last_alloc_physical_block =
-				le32_to_cpu(where[num].key) + blks - 1;
-	}
-
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 
 	inode->i_ctime = ext4_current_time(inode);
@@ -914,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		goto cleanup;
 
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary
+	 * Okay, we need to do block allocation.
 	*/
-	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
-
 	goal = ext4_find_goal(inode, iblock, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -3738,7 +3712,7 @@ void ext4_truncate(struct inode *inode)
 	 */
 	down_write(&ei->i_data_sem);
 
-	ext4_discard_reservation(inode);
+	ext4_discard_preallocations(inode);
 
 	/*
 	 * The orphan list entry will now protect us from any crash which
@@ -4071,7 +4045,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3e14060b398..ea27eaa0cfe 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,7 +23,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int flags;
-	unsigned short rsv_window_size;
 
 	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
 
@@ -190,49 +189,6 @@ setversion_out:
 			return ret;
 		}
 #endif
-	case EXT4_IOC_GETRSVSZ:
-		if (test_opt(inode->i_sb, RESERVATION)
-			&& S_ISREG(inode->i_mode)
-			&& ei->i_block_alloc_info) {
-			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
-			return put_user(rsv_window_size, (int __user *)arg);
-		}
-		return -ENOTTY;
-	case EXT4_IOC_SETRSVSZ: {
-		int err;
-
-		if (!test_opt(inode->i_sb, RESERVATION) || !S_ISREG(inode->i_mode))
-			return -ENOTTY;
-
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
-
-		if (get_user(rsv_window_size, (int __user *)arg))
-			return -EFAULT;
-
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
-			return err;
-
-		if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
-			rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
-
-		/*
-		 * need to allocate reservation structure for this inode
-		 * before set the window size
-		 */
-		down_write(&ei->i_data_sem);
-		if (!ei->i_block_alloc_info)
-			ext4_init_block_alloc_info(inode);
-
-		if (ei->i_block_alloc_info){
-			struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
-			rsv->rsv_goal_size = rsv_window_size;
-		}
-		up_write(&ei->i_data_sem);
-		mnt_drop_write(filp->f_path.mnt);
-		return 0;
-	}
 	case EXT4_IOC_GROUP_EXTEND: {
 		ext4_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b9118bb2993..335faee0c0f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -534,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 	void *buddy;
 	void *buddy2;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	{
 		static int mb_check_counter;
 		if (mb_check_counter++ % 100 != 0)
@@ -2487,19 +2484,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned max;
 	int ret;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		return -ENOMEM;
 	}
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
@@ -2522,7 +2514,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return ret;
@@ -2544,7 +2535,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
@@ -2590,9 +2580,6 @@ int ext4_mb_release(struct super_block *sb)
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	/* release freed, non-committed blocks */
 	spin_lock(&sbi->s_md_lock);
 	list_splice_init(&sbi->s_closed_transaction,
@@ -3805,7 +3792,7 @@ out:
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_mb_discard_inode_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -3817,7 +3804,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
 	struct ext4_buddy e4b;
 	int err;
 
-	if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+	if (!S_ISREG(inode->i_mode)) {
 		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
 		return;
 	}
@@ -4300,11 +4287,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC)) {
-		block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
-					    &(ar->len), errp);
-		return block;
-	}
 	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
 		/*
 		 * With delalloc we already reserved the blocks
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b60afbcd7e4..b6ec1843a01 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -870,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	if (test_opt(sb, MBALLOC)) {
-		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-		if (err)
-			goto exit_journal;
-	}
+	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+	if (err)
+		goto exit_journal;
+
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
 	 * increasing the group count so that once the group is enabled,
@@ -1086,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	/*
 	 * Mark mballoc pages as not up to date so that they will be updated
 	 * next time they are loaded by ext4_mb_load_buddy.
+	 *
+	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
+	 * Uptodate flag, particularly on thte bitmap bh, as way of
+	 * hinting to ext4_mb_load_buddy() that it needs to be
+	 * overloaded.  A user could take a LVM snapshot, then do an
+	 * on-line fsck, and clear the uptodate flag, and this would
+	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
 	 */
-	if (test_opt(sb, MBALLOC)) {
+	{
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		struct inode *inode = sbi->s_buddy_cache;
 		int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6583aee5177..dfcd41fafb9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -574,7 +574,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -633,7 +632,6 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
@@ -646,10 +644,7 @@ static void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
 	}
 #endif
-	ext4_discard_reservation(inode);
-	EXT4_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
+	ext4_discard_preallocations(inode);
 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
@@ -760,8 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
-	if (!test_opt(sb, MBALLOC))
-		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -1373,12 +1366,6 @@ set_qf_format:
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
-		case Opt_mballoc:
-			set_opt(sbi->s_mount_opt, MBALLOC);
-			break;
-		case Opt_nomballoc:
-			clear_opt(sbi->s_mount_opt, MBALLOC);
-			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -2040,11 +2027,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
 			"use tune2fs.\n");
-	/*
-	 * turn on mballoc code by default in ext4 filesystem
-	 * Use -o nomballoc to turn it off
-	 */
-	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	/*
 	 * enable delayed allocation by default
@@ -2301,19 +2283,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount3;
 	}
 
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 
 	/*
@@ -2510,7 +2479,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
 	ext4_ext_init(sb);
-	ext4_mb_init(sb, needs_recovery);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
 
 	lock_kernel();
 	return 0;
-- 
cgit v1.2.3


From c806e68f5647109350ec546fee5b526962970fd2 Mon Sep 17 00:00:00 2001
From: Frederic Bohe <frederic.bohe@bull.net>
Date: Fri, 10 Oct 2008 08:09:18 -0400
Subject: ext4: fix initialization of UNINIT bitmap blocks

This fixes a bug which caused on-line resizing of filesystems with a
1k blocksize to fail.  The root cause of this bug was the fact that if
an uninitalized bitmap block gets read in by userspace (which
e2fsprogs does try to avoid, but can happen when the blocksize is less
than the pagesize and an adjacent blocks is read into memory)
ext4_read_block_bitmap() was erroneously depending on the buffer
uptodate flag to decide whether it needed to initialize the bitmap
block in memory --- i.e., to set the standard set of blocks in use by
a block group (superblock, bitmaps, inode table, etc.).  Essentially,
ext4_read_block_bitmap() assumed it was the only routine that might
try to read a block containing a block bitmap, which is simply not
true.

To fix this, ext4_read_block_bitmap() and ext4_read_inode_bitmap()
must always initialize uninitialized bitmap blocks.  Once a block or
inode is allocated out of that bitmap, it will be marked as
initialized in the block group descriptor, so in general this won't
result any extra unnecessary work.

Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 4 +++-
 fs/ext4/ialloc.c  | 4 +++-
 fs/ext4/mballoc.c | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 59566c082f1..bd2ece22882 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -319,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1343bf18825..fe34d74cfb1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 335faee0c0f..b580714f0d8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -782,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (bh[i] == NULL)
 			goto out;
 
-		if (bh_uptodate_or_lock(bh[i]))
+		if (buffer_uptodate(bh[i]) &&
+		    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 			continue;
 
+		lock_buffer(bh[i]);
 		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
-- 
cgit v1.2.3


From 23f8b79eae8a74e42a006ffa7c456e295c7e1c0d Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 8 Oct 2008 23:28:31 -0400
Subject: jbd2: abort instead of waiting for nonexistent transaction

The __jbd2_log_wait_for_space function sits in a loop checkpointing
transactions until there is sufficient space free in the journal.
However, if there are no transactions to be processed (e.g.  because the
free space calculation is wrong due to a corrupted filesystem) it will
never progress.

Check for space being required when no transactions are outstanding and
abort the journal instead of endlessly looping.

This patch fixes the bug reported by Sami Liedes at:
http://bugzilla.kernel.org/show_bug.cgi?id=10976

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Cc: Sami Liedes <sliedes@cc.hut.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8..af4651bf357 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -126,14 +126,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 
 		/*
 		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock
+		 * were waiting for the checkpoint lock. If there are no
+		 * outstanding transactions there is nothing to checkpoint and
+		 * we can't make progress. Abort the journal in this case.
 		 */
 		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		if (__jbd2_log_space_left(journal) < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+
+			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
-			jbd2_log_do_checkpoint(journal);
+			if (chkpt) {
+				jbd2_log_do_checkpoint(journal);
+			} else {
+				printk(KERN_ERR "%s: no transactions\n",
+				       __func__);
+				jbd2_journal_abort(journal, 0);
+			}
+
 			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
 		}
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
-- 
cgit v1.2.3


From ede86cc473defab74d778aeac14b19f43129d4d1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 5 Oct 2008 20:50:06 -0400
Subject: ext4: Add debugging markers that can be used by systemtap

This debugging markers are designed to debug problems such as the
random filesystem latency problems reported by Arjan.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c      | 5 +++++
 fs/ext4/super.c      | 2 ++
 fs/jbd2/checkpoint.c | 3 +++
 fs/jbd2/commit.c     | 6 ++++++
 4 files changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index c37d1e86f51..5afe4370840 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
+	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+		   inode->i_sb->s_id, datasync, inode->i_ino,
+		   dentry->d_parent->d_inode->i_ino);
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dfcd41fafb9..9c0214689de 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -2951,6 +2952,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;
 
+	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index af4651bf357..42895d36945 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 
@@ -328,6 +329,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
+	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+		   journal->j_devname, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b091e5378fe..e91f051a985 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -368,6 +369,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
+	trace_mark(jbd2_start_commit, "dev %s transaction %d",
+		   journal->j_devname, commit_transaction->t_tid);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
@@ -985,6 +988,9 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
-- 
cgit v1.2.3


From 45a90bfd90c1215bf824c0f705b409723f52361b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 6 Oct 2008 12:04:02 -0400
Subject: jbd2: Fix buffer head leak when writing the commit block

Also make sure the buffer heads are marked clean before submitting bh
for writing.  The previous code was marking the buffer head dirty,
which would have forced an unneeded write (and seek) to the journal
for no good reason.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index e91f051a985..0d3814a35ed 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -127,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "submit commit block");
 	lock_buffer(bh);
-	get_bh(bh);
-	set_buffer_dirty(bh);
+	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
@@ -158,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		/* And try again, without the barrier */
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+		clear_buffer_dirty(bh);
 		ret = submit_bh(WRITE, bh);
 	}
 	*cbh = bh;
-- 
cgit v1.2.3


From 4d20c685fa365766a8f13584b4c8178a15ab7103 Mon Sep 17 00:00:00 2001
From: Kalpak Shah <kalpak.shah@sun.com>
Date: Wed, 8 Oct 2008 23:21:54 -0400
Subject: ext4: fix xattr deadlock

ext4_xattr_set_handle() eventually ends up calling
ext4_mark_inode_dirty() which tries to expand the inode by shifting
the EAs.  This leads to the xattr_sem being downed again and leading
to a deadlock.

This patch makes sure that if ext4_xattr_set_handle() is in the
call-chain, ext4_mark_inode_dirty() will not expand the inode.

Signed-off-by: Kalpak Shah <kalpak.shah@sun.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/xattr.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b489..362b0edd3db 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	struct ext4_xattr_block_find bs = {
 		.s = { .not_found = -ENODATA, },
 	};
+	unsigned long no_expand;
 	int error;
 
 	if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
 		goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
+	if (no_expand == 0)
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
-- 
cgit v1.2.3


From c4b929b85bdb64afacbbf6453b1f2bf7e14c9e89 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 8 Oct 2008 19:44:18 -0400
Subject: vfs: vfs-level fiemap interface

Basic vfs-level fiemap infrastructure, which sets up a new ->fiemap
inode operation.

Userspace can get extent information on a file via fiemap ioctl. As input,
the fiemap ioctl takes a struct fiemap which includes an array of struct
fiemap_extent (fm_extents). Size of the extent array is passed as
fm_extent_count and number of extents returned will be written into
fm_mapped_extents. Offset and length fields on the fiemap structure
(fm_start, fm_length) describe a logical range which will be searched for
extents. All extents returned will at least partially contain this range.
The actual extent offsets and ranges returned will be unmodified from their
offset and range on-disk.

The fiemap ioctl returns '0' on success. On error, -1 is returned and errno
is set. If errno is equal to EBADR, then fm_flags will contain those flags
which were passed in which the kernel did not understand. On all other
errors, the contents of fm_extents is undefined.

As fiemap evolved, there have been many authors of the vfs patch. As far as
I can tell, the list includes:
Kalpak Shah <kalpak.shah@sun.com>
Andreas Dilger <adilger@sun.com>
Eric Sandeen <sandeen@redhat.com>
Mark Fasheh <mfasheh@suse.com>

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: linux-api@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ioctl.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d..045d9601fbb 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -16,6 +16,9 @@
 
 #include <asm/ioctls.h>
 
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
 /**
  * vfs_ioctl - call filesystem specific ioctl methods
  * @filp:	open file to invoke ioctl method on
@@ -71,6 +74,156 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
 	return put_user(res, p);
 }
 
+/**
+ * fiemap_fill_next_extent - Fiemap helper function
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @logical:	Extent logical start offset, in bytes
+ * @phys:	Extent physical start offset, in bytes
+ * @len:	Extent length, in bytes
+ * @flags:	FIEMAP_EXTENT flags that describe this extent
+ *
+ * Called from file system ->fiemap callback. Will populate extent
+ * info as passed in via arguments and copy to user memory. On
+ * success, extent count on fieinfo is incremented.
+ *
+ * Returns 0 on success, -errno on error, 1 if this was the last
+ * extent that will fit in user array.
+ */
+#define SET_UNKNOWN_FLAGS	(FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS	(FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS	(FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
+			    u64 phys, u64 len, u32 flags)
+{
+	struct fiemap_extent extent;
+	struct fiemap_extent *dest = fieinfo->fi_extents_start;
+
+	/* only count the extents */
+	if (fieinfo->fi_extents_max == 0) {
+		fieinfo->fi_extents_mapped++;
+		return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+	}
+
+	if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
+		return 1;
+
+	if (flags & SET_UNKNOWN_FLAGS)
+		flags |= FIEMAP_EXTENT_UNKNOWN;
+	if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
+		flags |= FIEMAP_EXTENT_ENCODED;
+	if (flags & SET_NOT_ALIGNED_FLAGS)
+		flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+	memset(&extent, 0, sizeof(extent));
+	extent.fe_logical = logical;
+	extent.fe_physical = phys;
+	extent.fe_length = len;
+	extent.fe_flags = flags;
+
+	dest += fieinfo->fi_extents_mapped;
+	if (copy_to_user(dest, &extent, sizeof(extent)))
+		return -EFAULT;
+
+	fieinfo->fi_extents_mapped++;
+	if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
+		return 1;
+	return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+}
+EXPORT_SYMBOL(fiemap_fill_next_extent);
+
+/**
+ * fiemap_check_flags - check validity of requested flags for fiemap
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @fs_flags:	Set of fiemap flags that the file system understands
+ *
+ * Called from file system ->fiemap callback. This will compute the
+ * intersection of valid fiemap flags and those that the fs supports. That
+ * value is then compared against the user supplied flags. In case of bad user
+ * flags, the invalid values will be written into the fieinfo structure, and
+ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
+ * userspace. For this reason, a return code of -EBADR should be preserved.
+ *
+ * Returns 0 on success, -EBADR on bad flags.
+ */
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
+{
+	u32 incompat_flags;
+
+	incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
+	if (incompat_flags) {
+		fieinfo->fi_flags = incompat_flags;
+		return -EBADR;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fiemap_check_flags);
+
+static int fiemap_check_ranges(struct super_block *sb,
+			       u64 start, u64 len, u64 *new_len)
+{
+	*new_len = len;
+
+	if (len == 0)
+		return -EINVAL;
+
+	if (start > sb->s_maxbytes)
+		return -EFBIG;
+
+	/*
+	 * Shrink request scope to what the fs can actually handle.
+	 */
+	if ((len > sb->s_maxbytes) ||
+	    (sb->s_maxbytes - len) < start)
+		*new_len = sb->s_maxbytes - start;
+
+	return 0;
+}
+
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 len;
+	int error;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
+			   sizeof(struct fiemap)))
+		return -EFAULT;
+
+	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+		return -EINVAL;
+
+	error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+				    &len);
+	if (error)
+		return error;
+
+	fieinfo.fi_flags = fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap.fm_extent_count;
+	fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+
+	if (fiemap.fm_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+		return -EFAULT;
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
+	fiemap.fm_flags = fieinfo.fi_flags;
+	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+	if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+		error = -EFAULT;
+
+	return error;
+}
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -80,6 +233,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD:
-- 
cgit v1.2.3


From 00dc417fa3e763345b34ccb6034d72de76eea0a1 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 3 Oct 2008 17:32:11 -0400
Subject: ocfs2: fiemap support

Plug ocfs2 into ->fiemap. Some portions of ocfs2_get_clusters() had to be
refactored so that the extent cache can be skipped in favor of going
directly to the on-disk records. This makes it easier for us to determine
which extent is the last one in the btree. Also, I'm not sure we want to be
caching fiemap lookups anyway as they're not directly related to data
read/write.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: ocfs2-devel@oss.oracle.com
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ocfs2/alloc.c      |   9 --
 fs/ocfs2/alloc.h      |   9 ++
 fs/ocfs2/extent_map.c | 346 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/ocfs2/extent_map.h |   3 +
 fs/ocfs2/file.c       |   1 +
 5 files changed, 306 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e06..29ff57ec5d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -989,15 +989,6 @@ out:
 	return ret;
 }
 
-/*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
-	return !rec->e_leaf_clusters;
-}
-
 /*
  * This function will discard the rightmost extent record.
  */
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd801..60cd3d59230 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -146,4 +146,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
 		return le16_to_cpu(rec->e_leaf_clusters);
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326f..aed268e80b4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/fiemap.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
@@ -282,6 +284,51 @@ out:
 		kfree(new_emi);
 }
 
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
+			       &eb_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ret = -EROFS;
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		goto out;
+	}
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -373,42 +420,28 @@ out:
 	return ret;
 }
 
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters,
-		       unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
 {
-	int ret, i;
-	unsigned int flags = 0;
-	struct buffer_head *di_bh = NULL;
-	struct buffer_head *eb_bh = NULL;
+	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *uninitialized_var(eb);
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
-	u32 coff;
-
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = -ERANGE;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
-				      num_clusters, extent_flags);
-	if (ret == 0)
-		goto out;
+	struct buffer_head *eb_bh = NULL;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
 
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
 
-	if (el->l_tree_depth) {
+	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
@@ -431,46 +464,143 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
-		 * A hole was found. Return some canned values that
-		 * callers can key on. If asked for, num_clusters will
-		 * be populated with the size of the hole.
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a seperate
+		 * field.
 		 */
-		*p_cluster = 0;
-		if (num_clusters) {
+		if (hole_len) {
 			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-							 v_cluster,
-							 num_clusters);
+							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			*hole_len = len;
 		}
-	} else {
-		rec = &el->l_recs[i];
+		goto out_hole;
+	}
 
-		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+	rec = &el->l_recs[i];
 
-		if (!rec->e_blkno) {
-			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-				    "record (%u, %u, 0)", inode->i_ino,
-				    le32_to_cpu(rec->e_cpos),
-				    ocfs2_rec_clusters(el, rec));
-			ret = -EROFS;
-			goto out;
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in it's list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
 		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
 
-		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
 
-		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
-						    le64_to_cpu(rec->e_blkno));
-		*p_cluster = *p_cluster + coff;
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
 
-		if (num_clusters)
-			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		flags = rec->e_flags;
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		ocfs2_extent_map_insert_rec(inode, rec);
+	if (rec.e_blkno == 0ULL) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			*num_clusters = hole_len;
+		}
+	} else {
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
+
+		ocfs2_extent_map_insert_rec(inode, &rec);
 	}
 
 	if (extent_flags)
@@ -478,7 +608,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 out:
 	brelse(di_bh);
-	brelse(eb_bh);
 	return ret;
 }
 
@@ -521,3 +650,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 out:
 	return ret;
 }
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data separately.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	mapping_end -= cpos;
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a2..1b97490e1ea 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3da..ed38796052d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2228,6 +2228,7 @@ const struct inode_operations ocfs2_file_iops = {
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.fallocate	= ocfs2_fallocate,
+	.fiemap		= ocfs2_fiemap,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
-- 
cgit v1.2.3


From 68c9d702bb72f367f3b148963ec6cf5e07ff7f65 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 3 Oct 2008 17:32:43 -0400
Subject: generic block based fiemap implementation

Any block based fs (this patch includes ext3) just has to declare its own
fiemap() function and then call this generic function with its own
get_block_t. This works well for block based filesystems that will map
multiple contiguous blocks at one time, but will work for filesystems that
only map one block at a time, you will just end up with an "extent" for each
block. One gotcha is this will not play nicely where there is hole+data
after the EOF. This function will assume its hit the end of the data as soon
as it hits a hole after the EOF, so if there is any data past that it will
not pick that up. AFAIK no block based fs does this anyway, but its in the
comments of the function anyway just in case.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ext2/ext2.h  |   2 +
 fs/ext2/file.c  |   1 +
 fs/ext2/inode.c |   8 ++++
 fs/ext3/file.c  |   1 +
 fs/ext3/inode.c |   8 ++++
 fs/ioctl.c      | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33..bae998c1e44 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 int __ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c3629..45ed0712218 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.permission	= ext2_permission,
+	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51..7658b33e265 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/fiemap.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
 
 }
 
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d301..3be1e0689c9 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
+	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b11..ebfec4d0148 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/fiemap.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -981,6 +982,13 @@ out:
 	return ret;
 }
 
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 045d9601fbb..33a6b7ecb8b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,6 +13,8 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/ioctls.h>
 
@@ -224,6 +226,122 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	return error;
 }
 
+#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
+#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
+
+/*
+ * @inode - the inode to map
+ * @arg - the pointer to userspace where we copy everything to
+ * @get_block - the fs's get_block function
+ *
+ * This does FIEMAP for block based inodes.  Basically it will just loop
+ * through get_block until we hit the number of extents we want to map, or we
+ * go past the end of the file and hit a hole.
+ *
+ * If it is possible to have data blocks beyond a hole past @inode->i_size, then
+ * please do not use this function, it will stop at the first unmapped block
+ * beyond i_size
+ */
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	unsigned int start_blk;
+	long long length = 0, map_len = 0;
+	u64 logical = 0, phys = 0, size = 0;
+	u32 flags = FIEMAP_EXTENT_MERGED;
+	int ret = 0;
+
+	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
+		return ret;
+
+	start_blk = logical_to_blk(inode, start);
+
+	/* guard against change */
+	mutex_lock(&inode->i_mutex);
+
+	length = (long long)min_t(u64, len, i_size_read(inode));
+	map_len = length;
+
+	do {
+		/*
+		 * we set b_size to the total size we want so it will map as
+		 * many contiguous blocks as possible at once
+		 */
+		memset(&tmp, 0, sizeof(struct buffer_head));
+		tmp.b_size = map_len;
+
+		ret = get_block(inode, start_blk, &tmp, 0);
+		if (ret)
+			break;
+
+		/* HOLE */
+		if (!buffer_mapped(&tmp)) {
+			/*
+			 * first hole after going past the EOF, this is our
+			 * last extent
+			 */
+			if (length <= 0) {
+				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			length -= blk_to_logical(inode, 1);
+
+			/* if we have holes up to/past EOF then we're done */
+			if (length <= 0)
+				break;
+
+			start_blk++;
+		} else {
+			if (length <= 0 && size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				if (ret)
+					break;
+			}
+
+			logical = blk_to_logical(inode, start_blk);
+			phys = blk_to_logical(inode, tmp.b_blocknr);
+			size = tmp.b_size;
+			flags = FIEMAP_EXTENT_MERGED;
+
+			length -= tmp.b_size;
+			start_blk += logical_to_blk(inode, size);
+
+			/*
+			 * if we are past the EOF we need to loop again to see
+			 * if there is a hole so we can mark this extent as the
+			 * last one, and if not keep mapping things until we
+			 * find a hole, or we run out of slots in the extent
+			 * array
+			 */
+			if (length <= 0)
+				continue;
+
+			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
+						      size, flags);
+			if (ret)
+				break;
+		}
+		cond_resched();
+	} while (1);
+
+	mutex_unlock(&inode->i_mutex);
+
+	/* if ret is 1 then we just hit the end of the extent array */
+	if (ret == 1)
+		ret = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_block_fiemap);
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
-- 
cgit v1.2.3


From 6b37faa175311128dc920aaa57a5f7fab85537d7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 6 Oct 2008 21:54:41 +0000
Subject: [CIFS] fix some settings of cifsAttrs after calling SetFileInfo and
 SetPathInfo

We only need to set them when we call SetFileInfo or SetPathInfo
directly, and as soon as possible after then. We had one place setting
it where it didn't need to be, and another place where it was missing.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index db091c516c2..e387ed3f944 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -729,7 +729,10 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 				     &info_buf, cifs_sb->local_nls,
 				     cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != -EOPNOTSUPP && rc != -EINVAL)
+		if (rc == 0) {
+			cifsInode->cifsAttrs = dosattr;
+			goto out;
+		} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
 			goto out;
 	}
 
@@ -805,6 +808,7 @@ cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
 	kfree(info_buf);
 	if (rc != 0)
 		goto out_close;
+	cifsInode->cifsAttrs = dosattr;
 
 	/* silly-rename the file */
 	CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
@@ -905,7 +909,6 @@ psx_del_no_retry:
 			if (rc == 0)
 				drop_nlink(inode);
 		}
-		cifsInode->cifsAttrs = dosattr;
 	}
 out_reval:
 	if (inode) {
@@ -963,7 +966,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
 
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
-	int rc = 0;
+	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
@@ -1025,6 +1028,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				kfree(pInfo);
 				goto mkdir_get_info;
 			}
+
 			/* Is an i_ino of zero legal? */
 			/* Are there sanity checks we can use to ensure that
 			   the server is really filling in that field? */
@@ -1113,12 +1117,20 @@ mkdir_get_info:
 			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
 			    (mode & S_IWUGO) == 0) {
 				FILE_BASIC_INFO pInfo;
+				struct cifsInodeInfo *cifsInode;
+				u32 dosattrs;
+
 				memset(&pInfo, 0, sizeof(pInfo));
-				pInfo.Attributes = cpu_to_le32(ATTR_READONLY);
-				CIFSSMBSetPathInfo(xid, pTcon, full_path,
-						&pInfo, cifs_sb->local_nls,
+				cifsInode = CIFS_I(newinode);
+				dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+				pInfo.Attributes = cpu_to_le32(dosattrs);
+				tmprc = CIFSSMBSetPathInfo(xid, pTcon,
+						full_path, &pInfo,
+						cifs_sb->local_nls,
 						cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+				if (tmprc == 0)
+					cifsInode->cifsAttrs = dosattrs;
 			}
 			if (direntry->d_inode) {
 				if (cifs_sb->mnt_cifs_flags &
-- 
cgit v1.2.3


From 6873fa0de14e49c433f1f181c54e511f4f3d459d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 7 Oct 2008 00:46:36 -0400
Subject: Hook ext4 to the vfs fiemap interface.

ext4_ext_walk_space() was reinstated to be used for iterating over file
extents with a callback; it is used by the ext4 fiemap implementation.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ext4/ext4.h         |   2 +
 fs/ext4/ext4_extents.h |  15 +++
 fs/ext4/extents.c      | 248 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/file.c         |   4 +
 fs/ext4/inode.c        |   4 +-
 5 files changed, 271 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c50c04cc6d7..f46a513a515 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1067,6 +1067,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create);
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 				ext4_lblk_t iblock, unsigned long maxblocks,
 				struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index d33dc56d698..bec7ce59fc0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
 #define EXT4_EXT_CACHE_GAP	1
 #define EXT4_EXT_CACHE_EXTENT	2
 
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+					struct ext4_ext_cache *,
+					struct ext4_extent *, void *);
+
+#define EXT_CONTINUE   0
+#define EXT_BREAK      1
+#define EXT_REPEAT     2
 
 #define EXT_MAX_BLOCK	0xffffffff
 
@@ -224,6 +237,8 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
 				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
+							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
 extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c8f81f2fb28..ea2ce3c0ae6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/falloc.h>
 #include <asm/uaccess.h>
+#include <linux/fiemap.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
@@ -1626,6 +1627,113 @@ cleanup:
 	return err;
 }
 
+int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+			ext4_lblk_t num, ext_prepare_callback func,
+			void *cbdata)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_ext_cache cbex;
+	struct ext4_extent *ex;
+	ext4_lblk_t next, start = 0, end = 0;
+	ext4_lblk_t last = block + num;
+	int depth, exists, err = 0;
+
+	BUG_ON(func == NULL);
+	BUG_ON(inode == NULL);
+
+	while (block < last && block != EXT_MAX_BLOCK) {
+		num = last - block;
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, block, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		BUG_ON(path[depth].p_hdr == NULL);
+		ex = path[depth].p_ext;
+		next = ext4_ext_next_allocated_block(path);
+
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			cbex.ec_block = start;
+			cbex.ec_len = end - start;
+			cbex.ec_start = 0;
+			cbex.ec_type = EXT4_EXT_CACHE_GAP;
+		} else {
+			cbex.ec_block = le32_to_cpu(ex->ee_block);
+			cbex.ec_len = ext4_ext_get_actual_len(ex);
+			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+		}
+
+		BUG_ON(cbex.ec_len == 0);
+		err = func(inode, path, &cbex, ex, cbdata);
+		ext4_ext_drop_refs(path);
+
+		if (err < 0)
+			break;
+
+		if (err == EXT_REPEAT)
+			continue;
+		else if (err == EXT_BREAK) {
+			err = 0;
+			break;
+		}
+
+		if (ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
+		block = cbex.ec_block + cbex.ec_len;
+	}
+
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+
+	return err;
+}
+
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 			__u32 len, ext4_fsblk_t start, int type)
@@ -2971,3 +3079,143 @@ retry:
 	mutex_unlock(&inode->i_mutex);
 	return ret > 0 ? ret2 : ret;
 }
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
+		       void *data)
+{
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+	__u64	logical;
+	__u64	physical;
+	__u64	length;
+	__u32	flags = 0;
+	int	error;
+
+	logical =  (__u64)newex->ec_block << blksize_bits;
+
+	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+		pgoff_t offset;
+		struct page *page;
+		struct buffer_head *bh = NULL;
+
+		offset = logical >> PAGE_SHIFT;
+		page = find_get_page(inode->i_mapping, offset);
+		if (!page || !page_has_buffers(page))
+			return EXT_CONTINUE;
+
+		bh = page_buffers(page);
+
+		if (!bh)
+			return EXT_CONTINUE;
+
+		if (buffer_delay(bh)) {
+			flags |= FIEMAP_EXTENT_DELALLOC;
+			page_cache_release(page);
+		} else {
+			page_cache_release(page);
+			return EXT_CONTINUE;
+		}
+	}
+
+	physical = (__u64)newex->ec_start << blksize_bits;
+	length =   (__u64)newex->ec_len << blksize_bits;
+
+	if (ex && ext4_ext_is_uninitialized(ex))
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/*
+	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
+	 *
+	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
+	 * this also indicates no more allocated blocks.
+	 *
+	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
+	 */
+	if (logical + length - 1 == EXT_MAX_BLOCK ||
+	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+		flags |= FIEMAP_EXTENT_LAST;
+
+	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+					length, flags);
+	if (error < 0)
+		return error;
+	if (error == 1)
+		return EXT_BREAK;
+
+	return EXT_CONTINUE;
+}
+
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_LAST;
+	int blockbits = inode->i_sb->s_blocksize_bits;
+	int error = 0;
+
+	/* in-inode? */
+	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+		struct ext4_iloc iloc;
+		int offset;	/* offset of xattr in inode */
+
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error)
+			return error;
+		physical = iloc.bh->b_blocknr << blockbits;
+		offset = EXT4_GOOD_OLD_INODE_SIZE +
+				EXT4_I(inode)->i_extra_isize;
+		physical += offset;
+		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+	} else { /* external block */
+		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		length = inode->i_sb->s_blocksize;
+	}
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	ext4_lblk_t start_blk;
+	ext4_lblk_t len_blks;
+	int error = 0;
+
+	/* fallback to generic here if not in extents fmt */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return generic_block_fiemap(inode, fieinfo, start, len,
+			ext4_get_block);
+
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+		return -EBADR;
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+		error = ext4_xattr_fiemap(inode, fieinfo);
+	} else {
+		start_blk = start >> inode->i_sb->s_blocksize_bits;
+		len_blks = len >> inode->i_sb->s_blocksize_bits;
+
+		/*
+		 * Walk the extent tree gathering extent information.
+		 * ext4_ext_fiemap_cb will push extents back to user.
+		 */
+		down_write(&EXT4_I(inode)->i_data_sem);
+		error = ext4_ext_walk_space(inode, start_blk, len_blks,
+					  ext4_ext_fiemap_cb, fieinfo);
+		up_write(&EXT4_I(inode)->i_data_sem);
+	}
+
+	return error;
+}
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 62796b7e1d1..6d5be156202 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len);
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
 #endif
 	.permission	= ext4_permission,
 	.fallocate	= ext4_fallocate,
+	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bd770c360c1..a4747867411 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1135,8 +1135,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-static int ext4_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	int ret = 0, started = 0;
-- 
cgit v1.2.3


From 39d80c33a068d9fa63a36c3b2c0d718d38440dd1 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 6 Oct 2008 21:37:44 -0400
Subject: ext4: Avoid double dirtying of super block in ext4_put_super()

While reading code I noticed that ext4_put_super() dirties the
superblock bh twice. It is always done in ext4_commit_super()
too. Remove the redundant dirty operation.
Should be a nop semantically.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/ext4/super.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9c0214689de..7d865608e81 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -512,8 +512,6 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
 	if (sbi->s_proc) {
-- 
cgit v1.2.3


From 6050247d8089037d6d8ea0f3c62fe4a931c1ab14 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 7 Oct 2008 18:42:52 +0000
Subject: [CIFS]  clean up error handling in cifs_unlink

Currently, if a standard delete fails and we end up getting -EACCES
we try to clear ATTR_READONLY and try the delete again. If that
then fails with -ETXTBSY then we try a rename_pending_delete. We
aren't handling other errors appropriately though.

Another client could have deleted the file in the meantime and
we get back -ENOENT, for instance. In that case we wouldn't do a
d_drop. Instead of retrying in a separate call, simply goto the
original call and use the error handling from that.

Also, we weren't properly undoing any attribute changes that
were done before returning an error back to the caller.

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e387ed3f944..a8c833345fc 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -837,12 +837,12 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	int xid;
 	char *full_path = NULL;
 	struct inode *inode = dentry->d_inode;
-	struct cifsInodeInfo *cifsInode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
 	struct super_block *sb = dir->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *tcon = cifs_sb->tcon;
-	struct iattr *attrs;
-	__u32 dosattr;
+	struct iattr *attrs = NULL;
+	__u32 dosattr = 0, origattr = 0;
 
 	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
@@ -867,8 +867,10 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 			goto psx_del_no_retry;
 	}
 
+retry_std_delete:
 	rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
 psx_del_no_retry:
 	if (!rc) {
 		if (inode)
@@ -879,8 +881,7 @@ psx_del_no_retry:
 		rc = cifs_rename_pending_delete(full_path, inode, xid);
 		if (rc == 0)
 			drop_nlink(inode);
-	} else if (rc == -EACCES) {
-		/* try only if r/o attribute set in local lookup data? */
+	} else if (rc == -EACCES && dosattr == 0) {
 		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
 		if (attrs == NULL) {
 			rc = -ENOMEM;
@@ -888,28 +889,25 @@ psx_del_no_retry:
 		}
 
 		/* try to reset dos attributes */
-		cifsInode = CIFS_I(inode);
-		dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+		origattr = cifsInode->cifsAttrs;
+		if (origattr == 0)
+			origattr |= ATTR_NORMAL;
+		dosattr = origattr & ~ATTR_READONLY;
 		if (dosattr == 0)
 			dosattr |= ATTR_NORMAL;
 		dosattr |= ATTR_HIDDEN;
 
 		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
-		kfree(attrs);
 		if (rc != 0)
 			goto out_reval;
-		rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
-				    cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			if (inode)
-				drop_nlink(inode);
-		} else if (rc == -ETXTBSY) {
-			rc = cifs_rename_pending_delete(full_path, inode, xid);
-			if (rc == 0)
-				drop_nlink(inode);
-		}
+
+		goto retry_std_delete;
 	}
+
+	/* undo the setattr if we errored out and it's needed */
+	if (rc != 0 && dosattr != 0)
+		cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
 out_reval:
 	if (inode) {
 		cifsInode = CIFS_I(inode);
@@ -919,9 +917,10 @@ out_reval:
 	}
 	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
 	cifsInode = CIFS_I(dir);
-	cifsInode->time = 0;	/* force revalidate of dir as well */
+	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
 
 	kfree(full_path);
+	kfree(attrs);
 	FreeXid(xid);
 	return rc;
 }
-- 
cgit v1.2.3


From 0752f1522a9120f731232919f7ad904e9e22b8ce Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 7 Oct 2008 20:03:33 +0000
Subject: [CIFS] make sure we have the right resume info before calling
 CIFSFindNext

When we do a seekdir() or equivalent, we usually end up doing a
FindFirst call and then call FindNext until we get to the offset that we
want. The problem is that when we call FindNext, the code usually
doesn't have the proper info (mostly, the filename of the entry from the
last search) to resume the search.

Add a "last_entry" field to the cifs_search_info that points to the last
entry in the search. We calculate this pointer by using the
LastNameOffset field from the search parms that are returned. We then
use that info to do a cifs_save_resume_key before we call CIFSFindNext.

This patch allows CIFS to reliably pass the "telldir" connectathon test.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
CC: Stable <stable@kernel.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |   1 +
 fs/cifs/cifssmb.c  |   4 ++
 fs/cifs/readdir.c  | 128 +++++++++++++++++++++++++++--------------------------
 3 files changed, 70 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8dfd6f24d48..0d22479d99b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -309,6 +309,7 @@ struct cifs_search_info {
 	__u32 resume_key;
 	char *ntwrk_buf_start;
 	char *srch_entries_start;
+	char *last_entry;
 	char *presume_name;
 	unsigned int resume_name_len;
 	bool endOfSearch:1;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7504d1514c7..7b00a16e135 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3636,6 +3636,8 @@ findFirstRetry:
 					le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
 				psrch_inf->entries_in_buffer;
+			psrch_inf->last_entry = psrch_inf->srch_entries_start +
+					le16_to_cpu(parms->LastNameOffset);
 			*pnetfid = parms->SearchHandle;
 		} else {
 			cifs_buf_release(pSMB);
@@ -3751,6 +3753,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 						le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry +=
 				psrch_inf->entries_in_buffer;
+			psrch_inf->last_entry = psrch_inf->srch_entries_start +
+					le16_to_cpu(parms->LastNameOffset);
 /*  cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
 	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5f40ed3473f..765adf12d54 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
 
 }
 
+static int cifs_save_resume_key(const char *current_entry,
+	struct cifsFileInfo *cifsFile)
+{
+	int rc = 0;
+	unsigned int len = 0;
+	__u16 level;
+	char *filename;
+
+	if ((cifsFile == NULL) || (current_entry == NULL))
+		return -EINVAL;
+
+	level = cifsFile->srch_inf.info_level;
+
+	if (level == SMB_FIND_FILE_UNIX) {
+		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+
+		filename = &pFindData->FileName[0];
+		if (cifsFile->srch_inf.unicode) {
+			len = cifs_unicode_bytelen(filename);
+		} else {
+			/* BB should we make this strnlen of PATH_MAX? */
+			len = strnlen(filename, PATH_MAX);
+		}
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
+		FILE_DIRECTORY_INFO *pFindData =
+			(FILE_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
+		FILE_FULL_DIRECTORY_INFO *pFindData =
+			(FILE_FULL_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
+		SEARCH_ID_FULL_DIR_INFO *pFindData =
+			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
+		FILE_BOTH_DIRECTORY_INFO *pFindData =
+			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO *pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else {
+		cFYI(1, ("Unknown findfirst level %d", level));
+		return -EINVAL;
+	}
+	cifsFile->srch_inf.resume_name_len = len;
+	cifsFile->srch_inf.presume_name = filename;
+	return rc;
+}
+
 /* find the corresponding entry in the search */
 /* Note that the SMB server returns search entries for . and .. which
    complicates logic here if we choose to parse for them and we do not
@@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	      (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
 		cFYI(1, ("calling findnext2"));
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
 		if (rc)
@@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	return rc;
 }
 
-static int cifs_save_resume_key(const char *current_entry,
-	struct cifsFileInfo *cifsFile)
-{
-	int rc = 0;
-	unsigned int len = 0;
-	__u16 level;
-	char *filename;
-
-	if ((cifsFile == NULL) || (current_entry == NULL))
-		return -EINVAL;
-
-	level = cifsFile->srch_inf.info_level;
-
-	if (level == SMB_FIND_FILE_UNIX) {
-		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
-
-		filename = &pFindData->FileName[0];
-		if (cifsFile->srch_inf.unicode) {
-			len = cifs_unicode_bytelen(filename);
-		} else {
-			/* BB should we make this strnlen of PATH_MAX? */
-			len = strnlen(filename, PATH_MAX);
-		}
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
-		FILE_DIRECTORY_INFO *pFindData =
-			(FILE_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
-		FILE_FULL_DIRECTORY_INFO *pFindData =
-			(FILE_FULL_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
-		SEARCH_ID_FULL_DIR_INFO *pFindData =
-			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
-		FILE_BOTH_DIRECTORY_INFO *pFindData =
-			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
-		FIND_FILE_STANDARD_INFO *pFindData =
-			(FIND_FILE_STANDARD_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		/* one byte length, no name conversion */
-		len = (unsigned int)pFindData->FileNameLength;
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
-		return -EINVAL;
-	}
-	cifsFile->srch_inf.resume_name_len = len;
-	cifsFile->srch_inf.presume_name = filename;
-	return rc;
-}
 
 int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
-- 
cgit v1.2.3


From b77d753c413e02559669df66e543869dad40c847 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 8 Oct 2008 19:13:46 +0000
Subject: [CIFS] Check that last search entry resume key is valid

Jeff's recent patch to add a last_entry field in the search structure
to better construct resume keys did not validate that the server
sent us a plausible pointer to the last entry.  This adds that.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7b00a16e135..6f4ffe15d68 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3614,6 +3614,8 @@ findFirstRetry:
 		/* BB remember to free buffer if error BB */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc == 0) {
+			unsigned int lnoff;
+
 			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
 				psrch_inf->unicode = true;
 			else
@@ -3636,8 +3638,17 @@ findFirstRetry:
 					le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
 				psrch_inf->entries_in_buffer;
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+			      lnoff) {
+				cERROR(1, ("ignoring corrupt resume name"));
+				psrch_inf->last_entry = NULL;
+				return rc;
+			}
+
 			psrch_inf->last_entry = psrch_inf->srch_entries_start +
-					le16_to_cpu(parms->LastNameOffset);
+							lnoff;
+
 			*pnetfid = parms->SearchHandle;
 		} else {
 			cifs_buf_release(pSMB);
@@ -3727,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc == 0) {
+			unsigned int lnoff;
+
 			/* BB fixme add lock for file (srch_info) struct here */
 			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
 				psrch_inf->unicode = true;
@@ -3753,8 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 						le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry +=
 				psrch_inf->entries_in_buffer;
-			psrch_inf->last_entry = psrch_inf->srch_entries_start +
-					le16_to_cpu(parms->LastNameOffset);
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+			      lnoff) {
+				cERROR(1, ("ignoring corrupt resume name"));
+				psrch_inf->last_entry = NULL;
+				return rc;
+			} else
+				psrch_inf->last_entry =
+					psrch_inf->srch_entries_start + lnoff;
+
 /*  cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
 	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
 
-- 
cgit v1.2.3


From 8c540a96c175bdf55bda8707db04cec78b816454 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 5 Aug 2008 18:05:46 +0100
Subject: Let the block device know when sectors can be discarded

[hirofumi@mail.parknet.co.jp: discard _after_ checking for corrupt chains]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fat/fatent.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7..fb98b3d847e 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	struct fat_entry fatent;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	int i, err, nr_bhs;
+	int first_cl = cluster;
 
 	nr_bhs = 0;
 	fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
+		/* 
+		 * Issue discard for the sectors we no longer care about,
+		 * batching contiguous clusters into one request
+		 */
+		if (cluster != fatent.entry + 1) {
+			int nr_clus = fatent.entry - first_cl + 1;
+
+			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+					 nr_clus * sbi->sec_per_clus);
+			first_cl = cluster;
+		}
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;
-- 
cgit v1.2.3


From b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 15 Aug 2008 10:15:19 +0200
Subject: block: drop virtual merging accounting

Remove virtual merge accounting.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 3cba7ae34d7..4ac7c59d1c6 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -350,8 +350,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 */
 
 	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_hw_segments >= q->max_hw_segments
-	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+	       || bio->bi_hw_segments >= q->max_hw_segments) {
 
 		if (retried_segments)
 			return 0;
@@ -395,8 +394,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
-- 
cgit v1.2.3


From 5df97b91b5d7ed426034fcc84cb6e7cf682b8838 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 15 Aug 2008 10:20:02 +0200
Subject: drop vmerge accounting

Remove hw_segments field from struct bio and struct request. Without virtual
merge accounting they have no purpose.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 4ac7c59d1c6..bee4deca774 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -208,14 +208,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 	return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_hw_segments;
-}
-
 /**
  * 	__bio_clone	-	clone a bio
  * 	@bio: destination bio
@@ -350,7 +342,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 */
 
 	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_hw_segments >= q->max_hw_segments) {
+	       || bio->bi_phys_segments >= q->max_hw_segments) {
 
 		if (retried_segments)
 			return 0;
@@ -399,7 +391,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
-	bio->bi_hw_segments++;
  done:
 	bio->bi_size += len;
 	return len;
@@ -1381,7 +1372,6 @@ EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
 EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
-- 
cgit v1.2.3


From ec2cdedf798385a9397ac50dd0405dd658f8529c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:30:15 +0900
Subject: block: allow deleting zero length partition

delete_partition() was noop for zero length partition.  As the
addition code allows creating zero lenght partition and deletion is
assumed to always succeed, this causes memory leak for zero length
partitions.  Allow zero length partitions to end their meaningless
lives.

While at it, allow deleting zero lenght partition via
BLKPG_DEL_PARTITION ioctl too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ecc3330972e..68f3e41ae66 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -325,8 +325,6 @@ void delete_partition(struct gendisk *disk, int part)
 
 	if (!p)
 		return;
-	if (!p->nr_sects)
-		return;
 	disk->part[part-1] = NULL;
 	p->start_sect = 0;
 	p->nr_sects = 0;
-- 
cgit v1.2.3


From 88e341261ca4d39eec21b212961c77eff51105f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:30:16 +0900
Subject: block: update add_partition() error handling

d805dda4 tried to fix error case handling in add_partition() but had a
few problems.

* disk->part[] entry is set early and left dangling if operation
  fails.

* Once device initialized, the last put_device() is responsible for
  freeing all the resources.  The failure path freed part_stats and p
  regardless of put_device() causing double free.

* holders subdir holds reference to the disk device, so failure path
  should remove it to release resources properly which was missing.

This patch fixes the above problems and while at it move partition
slot busy check into add_partition() for completeness and inlines
holders subdirectory creation.  Using separate function for it just
obfuscates the code.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Abdel Benamrouche <draconux@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 68f3e41ae66..16f98d82460 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -300,15 +300,6 @@ struct device_type part_type = {
 	.release	= part_release,
 };
 
-static inline void partition_sysfs_add_subdir(struct hd_struct *p)
-{
-	struct kobject *k;
-
-	k = kobject_get(&p->dev.kobj);
-	p->holder_dir = kobject_create_and_add("holders", k);
-	kobject_put(k);
-}
-
 static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
@@ -347,13 +338,16 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	struct hd_struct *p;
 	int err;
 
+	if (disk->part[part - 1])
+		return -EBUSY;
+
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
-		goto out0;
+		goto out_free;
 	}
 	p->start_sect = start;
 	p->nr_sects = len;
@@ -372,34 +366,42 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
-	disk->part[part-1] = p;
 
 	/* delay uevent until 'holders' subdir is created */
 	p->dev.uevent_suppress = 1;
 	err = device_add(&p->dev);
 	if (err)
-		goto out1;
-	partition_sysfs_add_subdir(p);
+		goto out_put;
+
+	err = -ENOMEM;
+	p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj);
+	if (!p->holder_dir)
+		goto out_del;
+
 	p->dev.uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		err = device_create_file(&p->dev, &dev_attr_whole_disk);
 		if (err)
-			goto out2;
+			goto out_del;
 	}
 
+	/* everything is up and running, commence */
+	disk->part[part - 1] = p;
+
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
 		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 
 	return 0;
 
-out2:
+out_free:
+	kfree(p);
+	return err;
+out_del:
+	kobject_put(p->holder_dir);
 	device_del(&p->dev);
-out1:
+out_put:
 	put_device(&p->dev);
-	free_part_stats(p);
-out0:
-	kfree(p);
 	return err;
 }
 
-- 
cgit v1.2.3


From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:01:09 +0200
Subject: block: make variable and argument names more consistent

In hd_struct, @partno is used to denote partition number and a number
of other places use @part to denote hd_struct.  Functions use @part
and @index instead.  This causes confusion and makes it difficult to
use consistent variable names for hd_struct.  Always use @partno if a
variable represents partition number.

Also, print out functions use @f or @part for seq_file argument.  Use
@seqf uniformly instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        |  8 ++++----
 fs/partitions/check.c | 33 +++++++++++++++++----------------
 2 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e04..de0776cd721 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -930,7 +930,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	struct module *owner = NULL;
 	struct gendisk *disk;
 	int ret;
-	int part;
+	int partno;
 	int perm = 0;
 
 	if (file->f_mode & FMODE_READ)
@@ -949,7 +949,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
 	lock_kernel();
-	disk = get_gendisk(bdev->bd_dev, &part);
+	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk) {
 		unlock_kernel();
 		bdput(bdev);
@@ -961,7 +961,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
 		bdev->bd_contains = bdev;
-		if (!part) {
+		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev->bd_inode, file);
@@ -989,7 +989,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (ret)
 				goto out_first;
 			bdev->bd_contains = whole;
-			p = disk->part[part - 1];
+			p = disk->part[partno - 1];
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
 			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 16f98d82460..b86aab1b0df 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,22 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
  * a pointer to that same buffer (for convenience).
  */
 
-char *disk_name(struct gendisk *hd, int part, char *buf)
+char *disk_name(struct gendisk *hd, int partno, char *buf)
 {
-	if (!part)
+	if (!partno)
 		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
 	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
-		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
 	else
-		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
 
 	return buf;
 }
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
-	return disk_name(bdev->bd_disk, part, buf);
+	int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
+	return disk_name(bdev->bd_disk, partno, buf);
 }
 
 EXPORT_SYMBOL(bdevname);
@@ -310,13 +310,13 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	kobject_put(k);
 }
 
-void delete_partition(struct gendisk *disk, int part)
+void delete_partition(struct gendisk *disk, int partno)
 {
-	struct hd_struct *p = disk->part[part-1];
+	struct hd_struct *p = disk->part[partno - 1];
 
 	if (!p)
 		return;
-	disk->part[part-1] = NULL;
+	disk->part[partno - 1] = NULL;
 	p->start_sect = 0;
 	p->nr_sects = 0;
 	part_stat_set_all(p, 0);
@@ -333,12 +333,13 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
+int add_partition(struct gendisk *disk, int partno,
+		  sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	int err;
 
-	if (disk->part[part - 1])
+	if (disk->part[partno - 1])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -351,18 +352,18 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	}
 	p->start_sect = start;
 	p->nr_sects = len;
-	p->partno = part;
+	p->partno = partno;
 	p->policy = disk->policy;
 
 	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
 		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-		"%sp%d", disk->dev.bus_id, part);
+		"%sp%d", disk->dev.bus_id, partno);
 	else
 		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-			 "%s%d", disk->dev.bus_id, part);
+			 "%s%d", disk->dev.bus_id, partno);
 
 	device_initialize(&p->dev);
-	p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+	p->dev.devt = MKDEV(disk->major, disk->first_minor + partno);
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
@@ -386,7 +387,7 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	}
 
 	/* everything is up and running, commence */
-	disk->part[part - 1] = p;
+	disk->part[partno - 1] = p;
 
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
-- 
cgit v1.2.3


From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:01:48 +0200
Subject: block: don't depend on consecutive minor space

* Implement disk_devt() and part_devt() and use them to directly
  access devt instead of computing it from ->major and ->first_minor.

  Note that all references to ->major and ->first_minor outside of
  block layer is used to determine devt of the disk (the part0) and as
  ->major and ->first_minor will continue to represent devt for the
  disk, converting these users aren't strictly necessary.  However,
  convert them for consistency.

* Implement disk_max_parts() to avoid directly deferencing
  genhd->minors.

* Update bdget_disk() such that it doesn't assume consecutive minor
  space.

* Move devt computation from register_disk() to add_disk() and make it
  the only one (all other usages use the initially determined value).

These changes clean up the code and will help disk->part dereference
fix and extended block device numbers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        |  2 +-
 fs/partitions/check.c | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index de0776cd721..72e0a2887cb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev)
 
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (bdev->bd_disk->minors > 1)
+	if (disk_max_parts(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 	return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b86aab1b0df..e77fa144a07 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -134,7 +134,11 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
+	int partno = 0;
+
+	if (bdev->bd_part)
+		partno = bdev->bd_part->partno;
+
 	return disk_name(bdev->bd_disk, partno, buf);
 }
 
@@ -169,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
-	state->limit = hd->minors;
+	state->limit = disk_max_parts(hd) + 1;
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
@@ -416,7 +420,6 @@ void register_disk(struct gendisk *disk)
 	int err;
 
 	disk->dev.parent = disk->driverfs_dev;
-	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
 
 	strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
 	/* ewww... some of these buggers have / in the name... */
@@ -440,7 +443,7 @@ void register_disk(struct gendisk *disk)
 	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
-	if (disk->minors == 1)
+	if (!disk_max_parts(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
@@ -463,8 +466,8 @@ exit:
 	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
-	for (i = 1; i < disk->minors; i++) {
-		p = disk->part[i-1];
+	for (i = 0; i < disk_max_parts(disk); i++) {
+		p = disk->part[i];
 		if (!p || !p->nr_sects)
 			continue;
 		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
@@ -482,7 +485,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (res)
 		return res;
 	bdev->bd_invalidated = 0;
-	for (p = 1; p < disk->minors; p++)
+	for (p = 1; p <= disk_max_parts(disk); p++)
 		delete_partition(disk, p);
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
@@ -545,7 +548,7 @@ void del_gendisk(struct gendisk *disk)
 	int p;
 
 	/* invalidate stuff */
-	for (p = disk->minors - 1; p > 0; p--) {
+	for (p = disk_max_parts(disk); p > 0; p--) {
 		invalidate_partition(disk, p);
 		delete_partition(disk, p);
 	}
-- 
cgit v1.2.3


From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:03:02 +0200
Subject: block: fix disk->part[] dereferencing race

disk->part[] is protected by its matching bdev's lock.  However,
non-critical accesses like collecting stats and printing out sysfs and
proc information used to be performed without any locking.  As
partitions can come and go dynamically, partitions can go away
underneath those non-critical accesses.  As some of those accesses are
writes, this theoretically can lead to silent corruption.

This patch fixes the race by using RCU for the partition array and dev
reference counter to hold partitions.

* Rename disk->part[] to disk->__part[] to make sure no one outside
  genhd layer proper accesses it directly.

* Use RCU for disk->__part[] dereferencing.

* Implement disk_{get|put}_part() which can be used to get and put
  partitions from gendisk respectively.

* Iterators are implemented to help iterate through all partitions
  safely.

* Functions which require RCU readlock are marked with _rcu suffix.

* Use disk_put_part() in __blkdev_put() instead of directly putting
  the contained kobject.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 15 +++++------
 fs/partitions/check.c | 70 +++++++++++++++++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 72e0a2887cb..2f2873b9a04 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -929,6 +929,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
 	struct module *owner = NULL;
 	struct gendisk *disk;
+	struct hd_struct *part = NULL;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -978,7 +979,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(disk, bdev);
 		} else {
-			struct hd_struct *p;
 			struct block_device *whole;
 			whole = bdget_disk(disk, 0);
 			ret = -ENOMEM;
@@ -989,16 +989,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (ret)
 				goto out_first;
 			bdev->bd_contains = whole;
-			p = disk->part[partno - 1];
+			part = disk_get_part(disk, partno);
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
-			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+			if (!(disk->flags & GENHD_FL_UP) ||
+			    !part || !part->nr_sects) {
 				ret = -ENXIO;
 				goto out_first;
 			}
-			kobject_get(&p->dev.kobj);
-			bdev->bd_part = p;
-			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+			bdev->bd_part = part;
+			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
 		}
 	} else {
 		put_disk(disk);
@@ -1027,6 +1027,7 @@ out_first:
 		__blkdev_put(bdev->bd_contains, 1);
 	bdev->bd_contains = NULL;
 	put_disk(disk);
+	disk_put_part(part);
 	module_put(owner);
 out:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1119,7 +1120,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		module_put(owner);
 
 		if (bdev->bd_contains != bdev) {
-			kobject_put(&bdev->bd_part->dev.kobj);
+			disk_put_part(bdev->bd_part);
 			bdev->bd_part = NULL;
 		}
 		bdev->bd_disk = NULL;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e77fa144a07..96c8bf41e45 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -314,19 +314,29 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	kobject_put(k);
 }
 
+static void delete_partition_rcu_cb(struct rcu_head *head)
+{
+	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+
+	part->start_sect = 0;
+	part->nr_sects = 0;
+	part_stat_set_all(part, 0);
+	put_device(&part->dev);
+}
+
 void delete_partition(struct gendisk *disk, int partno)
 {
-	struct hd_struct *p = disk->part[partno - 1];
+	struct hd_struct *part;
 
-	if (!p)
+	part = disk->__part[partno-1];
+	if (!part)
 		return;
-	disk->part[partno - 1] = NULL;
-	p->start_sect = 0;
-	p->nr_sects = 0;
-	part_stat_set_all(p, 0);
-	kobject_put(p->holder_dir);
-	device_del(&p->dev);
-	put_device(&p->dev);
+
+	rcu_assign_pointer(disk->__part[partno-1], NULL);
+	kobject_put(part->holder_dir);
+	device_del(&part->dev);
+
+	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -343,7 +353,7 @@ int add_partition(struct gendisk *disk, int partno,
 	struct hd_struct *p;
 	int err;
 
-	if (disk->part[partno - 1])
+	if (disk->__part[partno - 1])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -391,7 +401,8 @@ int add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	disk->part[partno - 1] = p;
+	INIT_RCU_HEAD(&p->rcu_head);
+	rcu_assign_pointer(disk->__part[partno - 1], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
@@ -414,9 +425,9 @@ out_put:
 void register_disk(struct gendisk *disk)
 {
 	struct block_device *bdev;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	char *s;
-	int i;
-	struct hd_struct *p;
 	int err;
 
 	disk->dev.parent = disk->driverfs_dev;
@@ -466,16 +477,16 @@ exit:
 	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
-	for (i = 0; i < disk_max_parts(disk); i++) {
-		p = disk->part[i];
-		if (!p || !p->nr_sects)
-			continue;
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
-	}
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(&part->dev.kobj, KOBJ_ADD);
+	disk_part_iter_exit(&piter);
 }
 
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	struct parsed_partitions *state;
 	int p, res;
 
@@ -485,8 +496,12 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (res)
 		return res;
 	bdev->bd_invalidated = 0;
-	for (p = 1; p <= disk_max_parts(disk); p++)
-		delete_partition(disk, p);
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	while ((part = disk_part_iter_next(&piter)))
+		delete_partition(disk, part->partno);
+	disk_part_iter_exit(&piter);
+
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
@@ -545,13 +560,18 @@ EXPORT_SYMBOL(read_dev_sector);
 
 void del_gendisk(struct gendisk *disk)
 {
-	int p;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 
 	/* invalidate stuff */
-	for (p = disk_max_parts(disk); p > 0; p--) {
-		invalidate_partition(disk, p);
-		delete_partition(disk, p);
+	disk_part_iter_init(&piter, disk,
+			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+	while ((part = disk_part_iter_next(&piter))) {
+		invalidate_partition(disk, part->partno);
+		delete_partition(disk, part->partno);
 	}
+	disk_part_iter_exit(&piter);
+
 	invalidate_partition(disk, 0);
 	disk->capacity = 0;
 	disk->flags &= ~GENHD_FL_UP;
-- 
cgit v1.2.3


From c9959059161ddd7bf4670cf47367033d6b2f79c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:21 +0900
Subject: block: fix diskstats access

There are two variants of stat functions - ones prefixed with double
underbars which don't care about preemption and ones without which
disable preemption before manipulating per-cpu counters.  It's unclear
whether the underbarred ones assume that preemtion is disabled on
entry as some callers don't do that.

This patch unifies diskstats access by implementing disk_stat_lock()
and disk_stat_unlock() which take care of both RCU (for partition
access) and preemption (for per-cpu counter access).  diskstats access
should always be enclosed between the two functions.  As such, there's
no need for the versions which disables preemption.  They're removed
and double underbars ones are renamed to drop the underbars.  As an
extra argument is added, there's no danger of using the old version
unconverted.

disk_stat_lock() uses get_cpu() and returns the cpu index and all
diskstat functions which access per-cpu counters now has @cpu
argument to help RT.

This change adds RCU or preemption operations at some places but also
collapses several preemption ops into one at others.  Overall, the
performance difference should be negligible as all involved ops are
very lightweight per-cpu ones.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 96c8bf41e45..c442f0aadac 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,10 +219,11 @@ static ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
+	int cpu;
 
-	preempt_disable();
-	part_round_stats(p);
-	preempt_enable();
+	cpu = disk_stat_lock();
+	part_round_stats(cpu, p);
+	disk_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
-- 
cgit v1.2.3


From bcce3de1be61e424deef35d1e86e86a35c4b6e65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:22 +0900
Subject: block: implement extended dev numbers

Implement extended device numbers.  A block driver can tell block
layer that it wants to use extended device numbers.  After the usual
minor space is used up, block layer automatically allocates devt's
from EXT_BLOCK_MAJOR.

Currently only one major number is allocated for this but as the
allocation is strictly on-demand, ~1mil minor space under it should
suffice unless the system actually has more than ~1mil partitions and
if that ever happens adding more majors to the extended devt area is
easy.

Due to internal implementation issues, the first partition can't be
allocated on the extended area.  In other words, genhd->minors should
at least be 1.  This limitation will be lifted by later changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index c442f0aadac..0d4b7f28f13 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -333,6 +333,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	if (!part)
 		return;
 
+	blk_free_devt(part_devt(part));
 	rcu_assign_pointer(disk->__part[partno-1], NULL);
 	kobject_put(part->holder_dir);
 	device_del(&part->dev);
@@ -352,6 +353,7 @@ int add_partition(struct gendisk *disk, int partno,
 		  sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
+	dev_t devt = MKDEV(0, 0);
 	int err;
 
 	if (disk->__part[partno - 1])
@@ -378,11 +380,15 @@ int add_partition(struct gendisk *disk, int partno,
 			 "%s%d", disk->dev.bus_id, partno);
 
 	device_initialize(&p->dev);
-	p->dev.devt = MKDEV(disk->major, disk->first_minor + partno);
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
 
+	err = blk_alloc_devt(p, &devt);
+	if (err)
+		goto out_put;
+	p->dev.devt = devt;
+
 	/* delay uevent until 'holders' subdir is created */
 	p->dev.uevent_suppress = 1;
 	err = device_add(&p->dev);
@@ -419,6 +425,7 @@ out_del:
 	device_del(&p->dev);
 out_put:
 	put_device(&p->dev);
+	blk_free_devt(devt);
 	return err;
 }
 
-- 
cgit v1.2.3


From ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:05 +0900
Subject: block: implement and use {disk|part}_to_dev()

Implement {disk|part}_to_dev() and use them to access generic device
instead of directly dereferencing {disk|part}->dev.  To make sure no
user is left behind, rename generic devices fields to __dev.

This is in preparation of unifying partition 0 handling with other
partitions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        |  4 +--
 fs/partitions/check.c | 79 +++++++++++++++++++++++++++------------------------
 2 files changed, 44 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2f2873b9a04..a02df22f37c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -543,9 +543,9 @@ EXPORT_SYMBOL(bd_release);
 static struct kobject *bdev_get_kobj(struct block_device *bdev)
 {
 	if (bdev->bd_contains != bdev)
-		return kobject_get(&bdev->bd_part->dev.kobj);
+		return kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 	else
-		return kobject_get(&bdev->bd_disk->dev.kobj);
+		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
 }
 
 static struct kobject *bdev_get_holder(struct block_device *bdev)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0d4b7f28f13..ac0df3acdcd 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -309,7 +309,7 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
 
-	k = kobject_get(&disk->dev.kobj);
+	k = kobject_get(&disk_to_dev(disk)->kobj);
 	disk->holder_dir = kobject_create_and_add("holders", k);
 	disk->slave_dir = kobject_create_and_add("slaves", k);
 	kobject_put(k);
@@ -322,7 +322,7 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
-	put_device(&part->dev);
+	put_device(part_to_dev(part));
 }
 
 void delete_partition(struct gendisk *disk, int partno)
@@ -336,7 +336,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	blk_free_devt(part_devt(part));
 	rcu_assign_pointer(disk->__part[partno-1], NULL);
 	kobject_put(part->holder_dir);
-	device_del(&part->dev);
+	device_del(part_to_dev(part));
 
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
@@ -354,6 +354,9 @@ int add_partition(struct gendisk *disk, int partno,
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
+	struct device *ddev = disk_to_dev(disk);
+	struct device *pdev;
+	const char *dname;
 	int err;
 
 	if (disk->__part[partno - 1])
@@ -367,42 +370,43 @@ int add_partition(struct gendisk *disk, int partno,
 		err = -ENOMEM;
 		goto out_free;
 	}
+	pdev = part_to_dev(p);
+
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = disk->policy;
 
-	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-		"%sp%d", disk->dev.bus_id, partno);
+	dname = dev_name(ddev);
+	if (isdigit(dname[strlen(dname) - 1]))
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
 	else
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-			 "%s%d", disk->dev.bus_id, partno);
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
 
-	device_initialize(&p->dev);
-	p->dev.class = &block_class;
-	p->dev.type = &part_type;
-	p->dev.parent = &disk->dev;
+	device_initialize(pdev);
+	pdev->class = &block_class;
+	pdev->type = &part_type;
+	pdev->parent = ddev;
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_put;
-	p->dev.devt = devt;
+		goto out_free;
+	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
-	p->dev.uevent_suppress = 1;
-	err = device_add(&p->dev);
+	pdev->uevent_suppress = 1;
+	err = device_add(pdev);
 	if (err)
 		goto out_put;
 
 	err = -ENOMEM;
-	p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj);
+	p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
 	if (!p->holder_dir)
 		goto out_del;
 
-	p->dev.uevent_suppress = 0;
+	pdev->uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
-		err = device_create_file(&p->dev, &dev_attr_whole_disk);
+		err = device_create_file(pdev, &dev_attr_whole_disk);
 		if (err)
 			goto out_del;
 	}
@@ -412,8 +416,8 @@ int add_partition(struct gendisk *disk, int partno,
 	rcu_assign_pointer(disk->__part[partno - 1], p);
 
 	/* suppress uevent if the disk supresses it */
-	if (!disk->dev.uevent_suppress)
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
+	if (!ddev->uevent_suppress)
+		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
 	return 0;
 
@@ -422,9 +426,9 @@ out_free:
 	return err;
 out_del:
 	kobject_put(p->holder_dir);
-	device_del(&p->dev);
+	device_del(pdev);
 out_put:
-	put_device(&p->dev);
+	put_device(pdev);
 	blk_free_devt(devt);
 	return err;
 }
@@ -432,30 +436,31 @@ out_put:
 /* Not exported, helper to add_disk(). */
 void register_disk(struct gendisk *disk)
 {
+	struct device *ddev = disk_to_dev(disk);
 	struct block_device *bdev;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 	char *s;
 	int err;
 
-	disk->dev.parent = disk->driverfs_dev;
+	ddev->parent = disk->driverfs_dev;
 
-	strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
+	strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
 	/* ewww... some of these buggers have / in the name... */
-	s = strchr(disk->dev.bus_id, '/');
+	s = strchr(ddev->bus_id, '/');
 	if (s)
 		*s = '!';
 
 	/* delay uevents, until we scanned partition table */
-	disk->dev.uevent_suppress = 1;
+	ddev->uevent_suppress = 1;
 
-	if (device_add(&disk->dev))
+	if (device_add(ddev))
 		return;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &disk->dev.kobj,
-				kobject_name(&disk->dev.kobj));
+	err = sysfs_create_link(block_depr, &ddev->kobj,
+				kobject_name(&ddev->kobj));
 	if (err) {
-		device_del(&disk->dev);
+		device_del(ddev);
 		return;
 	}
 #endif
@@ -481,13 +486,13 @@ void register_disk(struct gendisk *disk)
 
 exit:
 	/* announce disk after possible partitions are created */
-	disk->dev.uevent_suppress = 0;
-	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
+	ddev->uevent_suppress = 0;
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
 	disk_part_iter_init(&piter, disk, 0);
 	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(&part->dev.kobj, KOBJ_ADD);
+		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 	disk_part_iter_exit(&piter);
 }
 
@@ -518,7 +523,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		return -EIO;
 
 	/* tell userspace that the media / partition table may have changed */
-	kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
@@ -591,7 +596,7 @@ void del_gendisk(struct gendisk *disk)
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, disk->dev.bus_id);
+	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 #endif
-	device_del(&disk->dev);
+	device_del(disk_to_dev(disk));
 }
-- 
cgit v1.2.3


From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:06:42 +0200
Subject: block: introduce partition 0

genhd and partition code handled disk and partitions separately.  All
information about the whole disk was in struct genhd and partitions in
struct hd_struct.  However, the whole disk (part0) and other
partitions have a lot in common and the data structures end up having
good number of common fields and thus separate code paths doing the
same thing.  Also, the partition array was indexed by partno - 1 which
gets pretty confusing at times.

This patch introduces partition 0 and makes the partition array
indexed by partno.  Following patches will unify the handling of disk
and parts piece-by-piece.

This patch also implements disk_partitionable() which tests whether a
disk is partitionable.  With coming dynamic partition array change,
the most common usage of disk_max_parts() will be testing whether a
disk is partitionable and the number of max partitions will become
much less important.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        |  2 +-
 fs/partitions/check.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a02df22f37c..c982a910797 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev)
 
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (disk_max_parts(bdev->bd_disk))
+	if (disk_partitionable(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 	return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ac0df3acdcd..b60699c271a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -173,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
-	state->limit = disk_max_parts(hd) + 1;
+	state->limit = disk_max_parts(hd);
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
@@ -329,12 +329,12 @@ void delete_partition(struct gendisk *disk, int partno)
 {
 	struct hd_struct *part;
 
-	part = disk->__part[partno-1];
+	part = disk->__part[partno];
 	if (!part)
 		return;
 
 	blk_free_devt(part_devt(part));
-	rcu_assign_pointer(disk->__part[partno-1], NULL);
+	rcu_assign_pointer(disk->__part[partno], NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
@@ -359,7 +359,7 @@ int add_partition(struct gendisk *disk, int partno,
 	const char *dname;
 	int err;
 
-	if (disk->__part[partno - 1])
+	if (disk->__part[partno])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -413,7 +413,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	/* everything is up and running, commence */
 	INIT_RCU_HEAD(&p->rcu_head);
-	rcu_assign_pointer(disk->__part[partno - 1], p);
+	rcu_assign_pointer(disk->__part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!ddev->uevent_suppress)
@@ -467,7 +467,7 @@ void register_disk(struct gendisk *disk)
 	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
-	if (!disk_max_parts(disk))
+	if (!disk_partitionable(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
-- 
cgit v1.2.3


From 80795aefb76d10c5d698e60c7e7750b5330787da Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:07 +0900
Subject: block: move capacity from disk to part0

Move disk->capacity to part0->nr_sects and convert all users who
directly accessed the field to use {get|set}_capacity().  This is done
early to allow the __dev field to be moved.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b60699c271a..902b95f1f9d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -586,7 +586,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
-	disk->capacity = 0;
+	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
 	disk_stat_set_all(disk, 0);
-- 
cgit v1.2.3


From e56105214943ce5f0901d20e972a7cfd0d1d0656 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:09 +0900
Subject: block: unify sysfs size node handling

Now that capacity and __dev are moved to part0, part0 and others can
share the same method.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 902b95f1f9d..24d2c56d7d2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -208,8 +208,8 @@ static ssize_t part_start_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
 
-static ssize_t part_size_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_size_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
-- 
cgit v1.2.3


From b7db9956e57c8151b930d5e5fe5c766e6aad3ff7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:10 +0900
Subject: block: move policy from disk to part0

Move disk->policy to part0->policy.  Implement and use get_disk_ro().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 24d2c56d7d2..ace6d03602c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -375,7 +375,7 @@ int add_partition(struct gendisk *disk, int partno,
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = partno;
-	p->policy = disk->policy;
+	p->policy = get_disk_ro(disk);
 
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
-- 
cgit v1.2.3


From 4c46501d1659475dc6c89554af6ce7fe6ecf615c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:11 +0900
Subject: block: move holder_dir from disk to part0

Move disk->holder_dir to part0->holder_dir.  Kill now mostly
superflous bdev_get_holder().

While at it, kill superflous kobject_get/put() around holder_dir,
slave_dir and cmd_filter creation and collapse
disk_sysfs_add_subdirs() into register_disk().  These serve no purpose
but obfuscating the code.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 10 +---------
 fs/partitions/check.c | 15 +++------------
 2 files changed, 4 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c982a910797..57d57264285 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -548,14 +548,6 @@ static struct kobject *bdev_get_kobj(struct block_device *bdev)
 		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
 }
 
-static struct kobject *bdev_get_holder(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(bdev->bd_part->holder_dir);
-	else
-		return kobject_get(bdev->bd_disk->holder_dir);
-}
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
@@ -608,7 +600,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
 	if (!bo->sdev)
 		goto fail_put_hdev;
 
-	bo->hdir = bdev_get_holder(bdev);
+	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
 	if (!bo->hdir)
 		goto fail_put_sdev;
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ace6d03602c..f0f604950ff 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -305,16 +305,6 @@ struct device_type part_type = {
 	.release	= part_release,
 };
 
-static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
-{
-	struct kobject *k;
-
-	k = kobject_get(&disk_to_dev(disk)->kobj);
-	disk->holder_dir = kobject_create_and_add("holders", k);
-	disk->slave_dir = kobject_create_and_add("slaves", k);
-	kobject_put(k);
-}
-
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
@@ -464,7 +454,8 @@ void register_disk(struct gendisk *disk)
 		return;
 	}
 #endif
-	disk_sysfs_add_subdirs(disk);
+	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
 	/* No minors to use for partitions */
 	if (!disk_partitionable(disk))
@@ -592,7 +583,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	kobject_put(disk->holder_dir);
+	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
-- 
cgit v1.2.3


From 0762b8bde9729f10f8e6249809660ff2ec3ad735 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:12 +0900
Subject: block: always set bdev->bd_part

Till now, bdev->bd_part is set only if the bdev was for parts other
than part0.  This patch makes bdev->bd_part always set so that code
paths don't have to differenciate common handling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 67 ++++++++++++++++++++++++---------------------------
 fs/partitions/check.c |  7 +-----
 2 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 57d57264285..c3fa19bd64d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,14 +540,6 @@ EXPORT_SYMBOL(bd_release);
  *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
  */
 
-static struct kobject *bdev_get_kobj(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-	else
-		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
-}
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
@@ -596,7 +588,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
 	if (!bo->hdev)
 		goto fail_put_sdir;
 
-	bo->sdev = bdev_get_kobj(bdev);
+	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 	if (!bo->sdev)
 		goto fail_put_hdev;
 
@@ -919,7 +911,6 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
 
 static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
-	struct module *owner = NULL;
 	struct gendisk *disk;
 	struct hd_struct *part = NULL;
 	int ret;
@@ -941,25 +932,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
+
 	lock_kernel();
+
 	disk = get_gendisk(bdev->bd_dev, &partno);
-	if (!disk) {
-		unlock_kernel();
-		bdput(bdev);
-		return ret;
-	}
-	owner = disk->fops->owner;
+	if (!disk)
+		goto out_unlock_kernel;
+	part = disk_get_part(disk, partno);
+	if (!part)
+		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev->bd_inode, file);
 				if (ret)
-					goto out_first;
+					goto out_clear;
 			}
 			if (!bdev->bd_openers) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -975,31 +968,32 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			whole = bdget_disk(disk, 0);
 			ret = -ENOMEM;
 			if (!whole)
-				goto out_first;
+				goto out_clear;
 			BUG_ON(for_part);
 			ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
 			if (ret)
-				goto out_first;
+				goto out_clear;
 			bdev->bd_contains = whole;
-			part = disk_get_part(disk, partno);
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
 			if (!(disk->flags & GENHD_FL_UP) ||
 			    !part || !part->nr_sects) {
 				ret = -ENXIO;
-				goto out_first;
+				goto out_clear;
 			}
-			bdev->bd_part = part;
 			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
 		}
 	} else {
+		disk_put_part(part);
 		put_disk(disk);
-		module_put(owner);
+		module_put(disk->fops->owner);
+		part = NULL;
+		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
 				if (ret)
-					goto out;
+					goto out_unlock_bdev;
 			}
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
@@ -1012,20 +1006,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	unlock_kernel();
 	return 0;
 
-out_first:
+ out_clear:
 	bdev->bd_disk = NULL;
+	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, 1);
 	bdev->bd_contains = NULL;
-	put_disk(disk);
-	disk_put_part(part);
-	module_put(owner);
-out:
+ out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
+ out_unlock_kernel:
 	unlock_kernel();
-	if (ret)
-		bdput(bdev);
+
+	disk_put_part(part);
+	if (disk)
+		module_put(disk->fops->owner);
+	put_disk(disk);
+	bdput(bdev);
+
 	return ret;
 }
 
@@ -1110,11 +1108,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 
 		put_disk(disk);
 		module_put(owner);
-
-		if (bdev->bd_contains != bdev) {
-			disk_put_part(bdev->bd_part);
-			bdev->bd_part = NULL;
-		}
+		disk_put_part(bdev->bd_part);
+		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 		if (bdev != bdev->bd_contains)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f0f604950ff..87298c0fc8c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -134,12 +134,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int partno = 0;
-
-	if (bdev->bd_part)
-		partno = bdev->bd_part->partno;
-
-	return disk_name(bdev->bd_disk, partno, buf);
+	return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
 }
 
 EXPORT_SYMBOL(bdevname);
-- 
cgit v1.2.3


From eddb2e26b5ee3c5da68ba4bf1921ba20e2097bff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:13 +0900
Subject: block: kill GENHD_FL_FAIL and use part0->make_it_fail

GENHD_FL_FAIL for disk is what make_it_fail is for parts.  Kill it and
use part0->make_it_fail.  Sysfs node handling is unified too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 87298c0fc8c..60592d9f43b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t part_fail_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_fail_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 
 	return sprintf(buf, "%d\n", p->make_it_fail);
 }
 
-static ssize_t part_fail_store(struct device *dev,
-			       struct device_attribute *attr,
-			       const char *buf, size_t count)
+ssize_t part_fail_store(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t count)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	int i;
-- 
cgit v1.2.3


From 074a7aca7afa6f230104e8e65eba3420263714a5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:14 +0900
Subject: block: move stats from disk to part0

Move stats related fields - stamp, in_flight, dkstats - from disk to
part0 and unify stat handling such that...

* part_stat_*() now updates part0 together if the specified partition
  is not part0.  ie. part_stat_*() are now essentially all_stat_*().

* {disk|all}_stat_*() are gone.

* part_round_stats() is updated similary.  It handles part0 stats
  automatically and disk_round_stats() is killed.

* part_{inc|dec}_in_fligh() is implemented which automatically updates
  part0 stats for parts other than part0.

* disk_map_sector_rcu() is updated to return part0 if no part matches.
  Combined with the above changes, this makes NULL special case
  handling in callers unnecessary.

* Separate stats show code paths for disk are collapsed into part
  stats show code paths.

* Rename disk_stat_lock/unlock() to part_stat_lock/unlock()

While at it, reposition stat handling macros a bit and add missing
parentheses around macro parameters.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 60592d9f43b..f517869e8d1 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -210,15 +210,15 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
-static ssize_t part_stat_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_stat_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	int cpu;
 
-	cpu = disk_stat_lock();
+	cpu = part_stat_lock();
 	part_round_stats(cpu, p);
-	disk_stat_unlock();
+	part_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -575,8 +575,8 @@ void del_gendisk(struct gendisk *disk)
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
-	disk_stat_set_all(disk, 0);
-	disk->stamp = 0;
+	part_stat_set_all(&disk->part0, 0);
+	disk->part0.stamp = 0;
 
 	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
-- 
cgit v1.2.3


From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:15 +0900
Subject: block: make partition array dynamic

disk->__part used to be statically allocated to the maximum possible
number of partitions.  This patch makes partition array allocation
dynamic.  The added overhead is minimal as only real change is one
memory dereference changed to RCU one.  This saves both a bit of
memory and cpu cycles iterating through unoccupied slots and makes
increasing partition limit easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f517869e8d1..772b2ed8d23 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,14 +312,18 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 
 void delete_partition(struct gendisk *disk, int partno)
 {
+	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
 
-	part = disk->__part[partno];
+	if (partno >= ptbl->len)
+		return;
+
+	part = ptbl->part[partno];
 	if (!part)
 		return;
 
 	blk_free_devt(part_devt(part));
-	rcu_assign_pointer(disk->__part[partno], NULL);
+	rcu_assign_pointer(ptbl->part[partno], NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
@@ -341,10 +345,16 @@ int add_partition(struct gendisk *disk, int partno,
 	dev_t devt = MKDEV(0, 0);
 	struct device *ddev = disk_to_dev(disk);
 	struct device *pdev;
+	struct disk_part_tbl *ptbl;
 	const char *dname;
 	int err;
 
-	if (disk->__part[partno])
+	err = disk_expand_part_tbl(disk, partno);
+	if (err)
+		return err;
+	ptbl = disk->part_tbl;
+
+	if (ptbl->part[partno])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -398,7 +408,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	/* everything is up and running, commence */
 	INIT_RCU_HEAD(&p->rcu_head);
-	rcu_assign_pointer(disk->__part[partno], p);
+	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!ddev->uevent_suppress)
@@ -487,7 +497,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 	struct parsed_partitions *state;
-	int p, res;
+	int p, highest, res;
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -511,6 +521,17 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
+	/* Detect the highest partition number and preallocate
+	 * disk->part_tbl.  This is an optimization and not strictly
+	 * necessary.
+	 */
+	for (p = 1, highest = 0; p < state->limit; p++)
+		if (state->parts[p].size)
+			highest = p;
+
+	disk_expand_part_tbl(disk, highest);
+
+	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
 		sector_t from = state->parts[p].from;
-- 
cgit v1.2.3


From 689d6fac40b41c7bf154f362deaf442548e4dc81 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:16 +0900
Subject: block: replace @ext_minors with GENHD_FL_EXT_DEVT

With previous changes, it's meaningless to limit the number of
partitions.  Replace @ext_minors with GENHD_FL_EXT_DEVT such that
setting the flag allows the disk to have maximum number of allowed
partitions (only limited by the number of entries in parsed_partitions
as determined by MAX_PART constant).

This kills not-too-pretty alloc_disk_ext[_node]() functions and makes
@minors parameter to alloc_disk[_node]() unnecessary.  The parameter
is left alone to avoid disturbing the users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8..98dbe1a8452 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
  * add_gd_partition adds a partitions details to the devices partition
  * description.
  */
-enum { MAX_PART = 256 };
-
 struct parsed_partitions {
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
 		sector_t size;
 		int flags;
-	} parts[MAX_PART];
+	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
 };
-- 
cgit v1.2.3


From 3e1a7ff8a0a7b948f2684930166954f9e8e776fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:17 +0900
Subject: block: allow disk to have extended device number

Now that disk and partition handlings are mostly unified, it's easy to
allow disk to have extended device number.  This patch makes
add_disk() use extended device number if disk->minors is zero.  Both
sd and ide-disk are updated to use this.

* sd_format_disk_name() is implemented which can generically determine
  the drive name.  This removes disk number restriction stemming from
  limited device names.

* If sd index goes over SD_MAX_DISKS (which can be increased now BTW),
  sd simply doesn't initialize minors letting block layer choose
  extended device number.

* If CONFIG_DEBUG_EXT_DEVT is set, both sd and ide-disk always set
  minors to 0 and use extended device numbers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 772b2ed8d23..0e411603fdf 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -593,6 +593,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
+	blk_free_devt(disk_to_dev(disk)->devt);
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
-- 
cgit v1.2.3


From c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sat, 13 Sep 2008 20:26:01 +0200
Subject: block: add support for IO CPU affinity

This patch adds support for controlling the IO completion CPU of
either all requests on a queue, or on a per-request basis. We export
a sysfs variable (rq_affinity) which, if set, migrates completions
of requests to the CPU that originally submitted it. A bio helper
(bio_set_completion_cpu()) is also added, so that queuers can ask
for completion on that specific CPU.

In testing, this has been show to cut the system time by as much
as 20-40% on synthetic workloads where CPU affinity is desired.

This requires a little help from the architecture, so it'll only
work as designed for archs that are using the new generic smp
helper infrastructure.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index bee4deca774..6a637b5c24b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 
-- 
cgit v1.2.3


From a3bce90edd8f6cafe3f63b1a943800792e830178 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:05 +0900
Subject: block: add gfp_mask argument to blk_rq_map_user and
 blk_rq_map_user_iov

Currently, blk_rq_map_user and blk_rq_map_user_iov always do
GFP_KERNEL allocation.

This adds gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov
so sg can use it (sg always does GFP_ATOMIC allocation).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 6a637b5c24b..3d2e9ad2472 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -558,13 +558,14 @@ int bio_uncopy_user(struct bio *bio)
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
 struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm)
+			      int iov_count, int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -587,12 +588,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		len += iov[i].iov_len;
 	}
 
-	bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL);
+	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -605,7 +606,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+		page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -647,26 +648,27 @@ out_bmd:
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
 struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+			  unsigned int len, int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+	return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
-				      int write_to_vm)
+				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
 	int nr_pages = 0;
@@ -692,12 +694,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
 	if (!pages)
 		goto out;
 
@@ -776,19 +778,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm)
+			 unsigned long uaddr, unsigned int len, int write_to_vm,
+			 gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 
 /**
@@ -798,18 +802,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
 			     struct sg_iovec *iov, int iov_count,
-			     int write_to_vm)
+			     int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
-
+	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
+				 gfp_mask);
 	if (IS_ERR(bio))
 		return bio;
 
-- 
cgit v1.2.3


From 152e283fdfea0cd11e297d982378b55937842dde Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:06 +0900
Subject: block: introduce struct rq_map_data to use reserved pages

This patch introduces struct rq_map_data to enable bio_copy_use_iov()
use reserved pages.

Currently, bio_copy_user_iov allocates bounce pages but
drivers/scsi/sg.c wants to allocate pages by itself and use
them. struct rq_map_data can be used to pass allocated pages to
bio_copy_user_iov.

The current users of bio_copy_user_iov simply passes NULL (they don't
want to use pre-allocated pages).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 58 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 3d2e9ad2472..a2f072647cd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -439,16 +439,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	int nr_sgvecs;
 	struct sg_iovec *sgvecs;
+	int nr_sgvecs;
+	int is_our_pages;
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count)
+			     struct sg_iovec *iov, int iov_count,
+			     int is_our_pages)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
 	bmd->nr_sgvecs = iov_count;
+	bmd->is_our_pages = is_our_pages;
 	bio->bi_private = bmd;
 }
 
@@ -483,7 +486,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
 }
 
 static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
-			  struct sg_iovec *iov, int iov_count, int uncopy)
+			  struct sg_iovec *iov, int iov_count, int uncopy,
+			  int do_free_page)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
@@ -526,7 +530,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 			}
 		}
 
-		if (uncopy)
+		if (do_free_page)
 			__free_page(bvec->bv_page);
 	}
 
@@ -545,7 +549,8 @@ int bio_uncopy_user(struct bio *bio)
 	struct bio_map_data *bmd = bio->bi_private;
 	int ret;
 
-	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);
+	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1,
+			     bmd->is_our_pages);
 
 	bio_free_map_data(bmd);
 	bio_put(bio);
@@ -555,6 +560,7 @@ int bio_uncopy_user(struct bio *bio)
 /**
  *	bio_copy_user_iov	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
@@ -564,8 +570,10 @@ int bio_uncopy_user(struct bio *bio)
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm, gfp_t gfp_mask)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+			      struct rq_map_data *map_data,
+			      struct sg_iovec *iov, int iov_count,
+			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -600,13 +608,26 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
+	i = 0;
 	while (len) {
-		unsigned int bytes = PAGE_SIZE;
+		unsigned int bytes;
+
+		if (map_data)
+			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+		else
+			bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | gfp_mask);
+		if (map_data) {
+			if (i == map_data->nr_entries) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = map_data->pages[i++];
+		} else
+			page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -625,16 +646,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	 * success
 	 */
 	if (!write_to_vm) {
-		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0);
+		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count);
+	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
 	return bio;
 cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
+	if (!map_data)
+		bio_for_each_segment(bvec, bio, i)
+			__free_page(bvec->bv_page);
 
 	bio_put(bio);
 out_bmd:
@@ -645,6 +667,7 @@ out_bmd:
 /**
  *	bio_copy_user	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
@@ -654,15 +677,16 @@ out_bmd:
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm, gfp_t gfp_mask)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+			  unsigned long uaddr, unsigned int len,
+			  int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask);
+	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
@@ -1028,7 +1052,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 	bio->bi_private = bmd;
 	bio->bi_end_io = bio_copy_kern_endio;
 
-	bio_set_map_data(bmd, bio, &iov, 1);
+	bio_set_map_data(bmd, bio, &iov, 1, 1);
 	return bio;
 cleanup:
 	bio_for_each_segment(bvec, bio, i)
-- 
cgit v1.2.3


From 4d8ab62e087d9300883b82c2662e73e6eef803a3 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 15:05:57 +0900
Subject: bio: convert bio_copy_kern to use bio_copy_user

bio_copy_kern and bio_copy_user are very similar. This converts
bio_copy_kern to use bio_copy_user.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 54 ++++--------------------------------------------------
 1 file changed, 4 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index a2f072647cd..9d68ddb89b7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -995,48 +995,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 			  gfp_t gfp_mask, int reading)
 {
-	unsigned long kaddr = (unsigned long)data;
-	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = kaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
 	struct bio *bio;
 	struct bio_vec *bvec;
-	struct bio_map_data *bmd;
-	int i, ret;
-	struct sg_iovec iov;
-
-	iov.iov_base = data;
-	iov.iov_len = len;
-
-	bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask);
-	if (!bmd)
-		return ERR_PTR(-ENOMEM);
-
-	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
-	if (!bio)
-		goto out_bmd;
-
-	while (len) {
-		struct page *page;
-		unsigned int bytes = PAGE_SIZE;
-
-		if (bytes > len)
-			bytes = len;
-
-		page = alloc_page(q->bounce_gfp | gfp_mask);
-		if (!page) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
-			ret = -EINVAL;
-			goto cleanup;
-		}
+	int i;
 
-		len -= bytes;
-	}
+	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
+	if (IS_ERR(bio))
+		return bio;
 
 	if (!reading) {
 		void *p = data;
@@ -1049,20 +1014,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 		}
 	}
 
-	bio->bi_private = bmd;
 	bio->bi_end_io = bio_copy_kern_endio;
 
-	bio_set_map_data(bmd, bio, &iov, 1, 1);
 	return bio;
-cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
-
-	bio_put(bio);
-out_bmd:
-	bio_free_map_data(bmd);
-
-	return ERR_PTR(ret);
 }
 
 /*
-- 
cgit v1.2.3


From 818827669d85b84241696ffef2de485db46b0b5e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 16:20:19 +0900
Subject: block: make blk_rq_map_user take a NULL user-space buffer

This patch changes blk_rq_map_user to accept a NULL user-space buffer
with a READ command if rq_map_data is not NULL. Thus a caller can pass
page frames to lk_rq_map_user to just set up a request and bios with
page frames propely. bio_uncopy_user (called via blk_rq_unmap_user)
doesn't copy data to user space with such request.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 9d68ddb89b7..355302985e2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -547,11 +547,11 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	int ret;
-
-	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1,
-			     bmd->is_our_pages);
+	int ret = 0;
 
+	if (!bio_flagged(bio, BIO_NULL_MAPPED))
+		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+				     bmd->nr_sgvecs, 1, bmd->is_our_pages);
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
-- 
cgit v1.2.3


From 0c002c2f74e10baa9021d3ecc50585c6eafea568 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:20 -0600
Subject: Wrapper for lower-level revalidate_disk routines.

This is a wrapper for the lower-level revalidate_disk call-backs such
as sd_revalidate_disk(). It allows us to perform pre and post
operations when calling them.

We will use this wrapper in a later patch to adjust block device sizes
after an online resize (a _post_ operation).

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3fa19bd64d..4eeb69a8873 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,27 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk
+ *                   call-back
+ *
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs.  It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+	int ret = 0;
+
+	if (disk->fops->revalidate_disk)
+		ret = disk->fops->revalidate_disk(disk);
+
+	return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
+
 /*
  * This routine checks whether a removable media has been changed,
  * and invalidates all buffer-cache-entries in that case. This
-- 
cgit v1.2.3


From c3279d1454cdfed02a557d789d8a6d08ab4cbe70 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:25 -0600
Subject: Adjust block device size after an online resize of a disk.

The revalidate_disk routine now checks if a disk has been resized by
comparing the gendisk capacity to the bdev inode size.  If they are
different (usually because the disk has been resized underneath the kernel)
the bdev inode size is adjusted to match the capacity.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4eeb69a8873..b721955d382 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,34 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * check_disk_size_change - checks for disk size change and adjusts
+ *                          bdev size.
+ *
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+	loff_t disk_size, bdev_size;
+
+	disk_size = (loff_t)get_capacity(disk) << 9;
+	bdev_size = i_size_read(bdev->bd_inode);
+	if (disk_size != bdev_size) {
+		char name[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, name);
+		printk(KERN_INFO
+		       "%s: detected capacity change from %lld to %lld\n",
+		       name, bdev_size, disk_size);
+		i_size_write(bdev->bd_inode, disk_size);
+	}
+}
+EXPORT_SYMBOL(check_disk_size_change);
+
 /**
  * revalidate_disk - wrapper for lower-level driver's revalidate_disk
  *                   call-back
@@ -864,11 +892,20 @@ EXPORT_SYMBOL(open_by_devnum);
  */
 int revalidate_disk(struct gendisk *disk)
 {
+	struct block_device *bdev;
 	int ret = 0;
 
 	if (disk->fops->revalidate_disk)
 		ret = disk->fops->revalidate_disk(disk);
 
+	bdev = bdget_disk(disk, 0);
+	if (!bdev)
+		return ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	check_disk_size_change(disk, bdev);
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
 	return ret;
 }
 EXPORT_SYMBOL(revalidate_disk);
-- 
cgit v1.2.3


From 9bc3ffbfbdf71fefda8a261ef8d6fdc388a29b42 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:30 -0600
Subject: Check for device resize when rescanning partitions

Check for device resize in the rescan_partitions() routine. If the device
has been resized, the bdev size is set to match. The rescan_partitions()
routine is called when opening the device and when calling the
BLKRRPART ioctl.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0e411603fdf..7408227c49c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -504,7 +504,6 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	res = invalidate_partition(disk, 0);
 	if (res)
 		return res;
-	bdev->bd_invalidated = 0;
 
 	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter)))
@@ -513,6 +512,8 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
+	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */
-- 
cgit v1.2.3


From 56ade44b46780fa291fa68b824f1dafdcb11b0ca Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:40 -0600
Subject: Added flush_disk to factor out common buffer cache flushing code.

We need to be able to flush the buffer cache for for more than
just when a disk is changed, so we factor out common cache flush code
in check_disk_change() to an internal flush_disk() routine.  This
routine will then be used for both disk changes and disk resizes (in a
later patch).

Include the disk name in the text indicating that there are busy
inodes on the device and increase the KERN severity of the message.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b721955d382..33650fc537c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,32 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev:      struct block device to be flushed
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev)
+{
+	if (__invalidate_device(bdev)) {
+		char name[BDEVNAME_SIZE] = "";
+
+		if (bdev->bd_disk)
+			disk_name(bdev->bd_disk, 0, name);
+		printk(KERN_WARNING "VFS: busy inodes on changed media or "
+		       "resized disk %s\n", name);
+	}
+
+	if (!bdev->bd_disk)
+		return;
+	if (disk_partitionable(bdev->bd_disk))
+		bdev->bd_invalidated = 1;
+}
+
 /**
  * check_disk_size_change - checks for disk size change and adjusts
  *                          bdev size.
@@ -929,13 +955,9 @@ int check_disk_change(struct block_device *bdev)
 	if (!bdops->media_changed(bdev->bd_disk))
 		return 0;
 
-	if (__invalidate_device(bdev))
-		printk("VFS: busy inodes on changed media.\n");
-
+	flush_disk(bdev);
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (disk_partitionable(bdev->bd_disk))
-		bdev->bd_invalidated = 1;
 	return 1;
 }
 
-- 
cgit v1.2.3


From 608aeef17a91747d6303de4df5e2c2e6899a95e8 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:45 -0600
Subject: Call flush_disk() after detecting an online resize.

We call flush_disk() to make sure the buffer cache for the disk is
flushed after a disk resize. There are two resize cases, growing and
shrinking. Given that users can shrink/then grow a disk before
revalidate_disk() is called, we treat the grow case identically to
shrinking. We need to flush the buffer cache after an online shrink
because, as James Bottomley puts it,

     The two use cases for shrinking I can see are

     1. planned: the fs is already shrunk to within the new boundaries
        and all data is relocated, so invalidate is fine (any dirty
        buffers that might exist in the shrunk region are there only
        because they were relocated but not yet written to their
        original location).
     2. unplanned:  In this case, the fs is probably toast, so whether
        we invalidate or not isn't going to make a whole lot of
        difference; it's still going to try to read or write from
        sectors beyond the new size and get I/O errors.

Immediately invalidating shrunk disks will cause errors for outstanding
I/Os for reads/write beyond the new end of the disk to be generated
earlier then if we waited for the normal buffer cache operation. It also
removes a potential security hole where we might keep old data around
from beyond the end of the shrunk disk if the disk was not invalidated.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 33650fc537c..57e2786dd2a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -902,6 +902,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 		       "%s: detected capacity change from %lld to %lld\n",
 		       name, bdev_size, disk_size);
 		i_size_write(bdev->bd_inode, disk_size);
+		flush_disk(bdev);
 	}
 }
 EXPORT_SYMBOL(check_disk_size_change);
-- 
cgit v1.2.3


From 0a0d96b03a1f3bfd6bc3ea08008699e8e59fccd9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 11 Sep 2008 13:17:37 +0200
Subject: block: add bio_kmalloc()

Not all callers need (or want!) the mempool backing guarentee, it
essentially means that you can only use bio_alloc() for short allocations
and not for preallocating some bio's at setup or init time.

So add bio_kmalloc() which does the same thing as bio_alloc(), except
it just uses kmalloc() as the backing instead of the bio mempools.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 355302985e2..e56e7685af9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	struct bio_vec *bvl;
 
 	/*
-	 * see comment near bvec_array define!
+	 * If 'bs' is given, lookup the pool and do the mempool alloc.
+	 * If not, this is a bio_kmalloc() allocation and just do a
+	 * kzalloc() for the exact number of vecs right away.
 	 */
-	switch (nr) {
-		case   1        : *idx = 0; break;
-		case   2 ...   4: *idx = 1; break;
-		case   5 ...  16: *idx = 2; break;
-		case  17 ...  64: *idx = 3; break;
-		case  65 ... 128: *idx = 4; break;
-		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+	if (bs) {
+		/*
+		 * see comment near bvec_array define!
+		 */
+		switch (nr) {
+		case 1:
+			*idx = 0;
+			break;
+		case 2 ... 4:
+			*idx = 1;
+			break;
+		case 5 ... 16:
+			*idx = 2;
+			break;
+		case 17 ... 64:
+			*idx = 3;
+			break;
+		case 65 ... 128:
+			*idx = 4;
+			break;
+		case 129 ... BIO_MAX_PAGES:
+			*idx = 5;
+			break;
 		default:
 			return NULL;
-	}
-	/*
-	 * idx now points to the pool we want to allocate from
-	 */
+		}
 
-	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl)
-		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+		/*
+		 * idx now points to the pool we want to allocate from
+		 */
+		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+		if (bvl)
+			memset(bvl, 0,
+				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+	} else
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	return bvl;
 }
@@ -107,6 +128,12 @@ static void bio_fs_destructor(struct bio *bio)
 	bio_free(bio, fs_bio_set);
 }
 
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	kfree(bio->bi_io_vec);
+	kfree(bio);
+}
+
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -119,19 +146,25 @@ void bio_init(struct bio *bio)
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from
+ * @bs:		the bio_set to allocate from. If %NULL, just use kmalloc
  *
  * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
  *
  *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	struct bio *bio;
+
+	if (bs)
+		bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	else
+		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
 		struct bio_vec *bvl = NULL;
@@ -142,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 			if (unlikely(!bvl)) {
-				mempool_free(bio, bs->bio_pool);
+				if (bs)
+					mempool_free(bio, bs->bio_pool);
+				else
+					kfree(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -165,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+	if (bio)
+		bio->bi_destructor = bio_kmalloc_destructor;
+
+	return bio;
+}
+
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
@@ -1349,6 +1402,7 @@ static int __init init_bio(void)
 subsys_initcall(init_bio);
 
 EXPORT_SYMBOL(bio_alloc);
+EXPORT_SYMBOL(bio_kmalloc);
 EXPORT_SYMBOL(bio_put);
 EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
-- 
cgit v1.2.3


From 9c02f2b02e29a2244e36c6e1f246080d8afc6cff Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 18 Sep 2008 09:31:53 -0700
Subject: block: cleanup some of the integrity stuff in blkdev.h

Don't put functions that are only used in fs/bio-integrity.c in
blkdev.h, it's much cleaner to just keep it in there. Also kill
completely unused bdev_get_tag_size()

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe..ba4ada08564 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -150,6 +150,29 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
+{
+	return bdev->bd_disk->integrity;
+}
+
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+	if (bi == NULL)
+		return 0;
+
+	if (rw == READ && bi->verify_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_READ))
+		return 1;
+
+	if (rw == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_WRITE))
+		return 1;
+
+	return 0;
+}
+
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio:	bio to check
@@ -313,6 +336,14 @@ static void bio_integrity_generate(struct bio *bio)
 	}
 }
 
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+	if (bi)
+		return bi->tuple_size;
+
+	return 0;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio:	bio to prepare
-- 
cgit v1.2.3


From b04accc425d52ca59699290661e0dfd09b0feeeb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 2 Oct 2008 12:53:22 +0200
Subject: block: revert part of d7533ad0e132f92e75c1b2eb7c26387b25a583c1

We need bdev_get_integrity() to support the pending md/dm patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index ba4ada08564..6e28dcdd23a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -150,11 +150,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
-static struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-	return bdev->bd_disk->integrity;
-}
-
 static int bdev_integrity_enabled(struct block_device *bdev, int rw)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bdev);
-- 
cgit v1.2.3


From 74aa8c2cc010035a7eef2b4ca4d6430e0dae206a Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 03:38:37 -0400
Subject: block: Introduce integrity data ownership flag

A filesystem might supply its own integrity metadata.  Introduce a
flag that indicates whether the filesystem or the block layer owns the
integrity buffer.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 6e28dcdd23a..19caf7c962a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	BUG_ON(bip == NULL);
 
 	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
 	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
-- 
cgit v1.2.3


From ad3316bf4eeb53c89164f759767f911072b56203 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 22:42:53 -0400
Subject: block: Find bio sector offset given idx and offset

Helper function to find the sector offset in a bio given bvec index
and page offset.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index e56e7685af9..a5af5809f56 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1300,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	return bp;
 }
 
+/**
+ *      bio_sector_offset - Find hardware sector offset in bio
+ *      @bio:           bio to inspect
+ *      @index:         bio_vec index
+ *      @offset:        offset in bv_page
+ *
+ *      Return the number of hardware sectors between beginning of bio
+ *      and an end point indicated by a bio_vec index and an offset
+ *      within that vector's page.
+ */
+sector_t bio_sector_offset(struct bio *bio, unsigned short index,
+			   unsigned int offset)
+{
+	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	struct bio_vec *bv;
+	sector_t sectors;
+	int i;
+
+	sectors = 0;
+
+	if (index >= bio->bi_idx)
+		index = bio->bi_vcnt - 1;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		if (i == index) {
+			if (offset > bv->bv_offset)
+				sectors += (offset - bv->bv_offset) / sector_sz;
+			break;
+		}
+
+		sectors += bv->bv_len / sector_sz;
+	}
+
+	return sectors;
+}
+EXPORT_SYMBOL(bio_sector_offset);
 
 /*
  * create memory pools for biovec's in a bio_set.
-- 
cgit v1.2.3


From 6feef531f55cf4a20fd9eb39f5352e5745203603 Mon Sep 17 00:00:00 2001
From: Denis ChengRq <crquan@gmail.com>
Date: Thu, 9 Oct 2008 08:57:05 +0200
Subject: block: mark bio_split_pool static

Since all bio_split calls refer the same single bio_split_pool, the bio_split
function can use bio_split_pool directly instead of the mempool_t parameter;

then the mempool_t parameter can be removed from bio_split param list, and
bio_split_pool is only referred in fs/bio.c file, can be marked static.

Signed-off-by: Denis ChengRq <crquan@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index a5af5809f56..77a55bcceed 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 static struct kmem_cache *bio_slab __read_mostly;
 
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 
 /*
  * if you change this list, also change bvec_alloc or things will
@@ -1256,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
  * split a bio - only worry about a bio with a single page
  * in it's iovec
  */
-struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
+struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
-	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
+	struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
 
 	if (!bp)
 		return bp;
@@ -1292,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	bp->bio2.bi_end_io = bio_pair_end_2;
 
 	bp->bio1.bi_private = bi;
-	bp->bio2.bi_private = pool;
+	bp->bio2.bi_private = bio_split_pool;
 
 	if (bio_integrity(bi))
 		bio_integrity_split(bi, bp, first_sectors);
@@ -1455,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_copy_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_split_pool);
 EXPORT_SYMBOL(bio_copy_user);
 EXPORT_SYMBOL(bio_uncopy_user);
 EXPORT_SYMBOL(bioset_create);
-- 
cgit v1.2.3


From 57d1b5366f46fe434e565b710baf683daff78dd8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 9 Oct 2008 10:42:38 +0200
Subject: block_dev: fix kernel-doc in new functions

Fix kernel-doc in new functions:

Error(mmotm-2008-1002-1617//fs/block_dev.c:895): duplicate section name 'Description'
Error(mmotm-2008-1002-1617//fs/block_dev.c:924): duplicate section name 'Description'
Warning(mmotm-2008-1002-1617//fs/block_dev.c:1282): No description found for parameter 'pathname'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 57e2786dd2a..d84f0469a01 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -879,9 +879,7 @@ static void flush_disk(struct block_device *bdev)
 }
 
 /**
- * check_disk_size_change - checks for disk size change and adjusts
- *                          bdev size.
- *
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
  * @disk: struct gendisk to check
  * @bdev: struct bdev to adjust.
  *
@@ -908,9 +906,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 EXPORT_SYMBOL(check_disk_size_change);
 
 /**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk
- *                   call-back
- *
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
  * @disk: struct gendisk to be revalidated
  *
  * This routine is a wrapper for lower-level driver's revalidate_disk
@@ -1266,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
+ * @pathname:	special file representing the block device
  *
- * @path:	special file representing the block device
- *
- * Get a reference to the blockdevice at @path in the current
+ * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
  * otherwise.
  */
-- 
cgit v1.2.3


From efc968d450e013049a662d22727cf132618dcb2f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 9 Oct 2008 14:04:54 -0700
Subject: Don't allow splice() to files opened with O_APPEND

This is debatable, but while we're debating it, let's disallow the
combination of splice and an O_APPEND destination.

It's not entirely clear what the semantics of O_APPEND should be, and
POSIX apparently expects pwrite() to ignore O_APPEND, for example.  So
we could make up any semantics we want, including the old ones.

But Miklos convinced me that we should at least give it some thought,
and that accepting writes at arbitrary offsets is wrong at least for
IS_APPEND() files (which always have O_APPEND set, even if the reverse
isn't true: you can obviously have O_APPEND set on a regular file).

So disallow O_APPEND entirely for now.  I doubt anybody cares, and this
way we have one less gray area to worry about.

Reported-and-argued-for-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Jens Axboe <ens.axboe@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/splice.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09..a1e701c2715 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
 	ret = rw_verify_area(WRITE, out, ppos, len);
 	if (unlikely(ret < 0))
 		return ret;
-- 
cgit v1.2.3


From 73f6aa4d44ab6157badc456ddfa05b31e58de5f0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 10 Oct 2008 17:28:29 +1100
Subject: Fix barrier fail detection in XFS

Currently we disable barriers as soon as we get a buffer in xlog_iodone
that has the XBF_ORDERED flag cleared.  But this can be the case not only
for buffers where the barrier failed, but also the first buffer of a
split log write in case of a log wraparound.  Due to the disabled
barriers we can easily get directory corruption on unclean shutdowns.
So instead of using this check add a new buffer flag for failed barrier
writes.

This is a regression vs 2.6.26 caused by patch to use the right macro
to check for the ORDERED flag, as we previously got true returned for
every buffer.

Thanks to Toei Rei for reporting the bug.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: David Chinner <david@fromorbit.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xfs/linux-2.6/xfs_buf.c | 3 ++-
 fs/xfs/linux-2.6/xfs_buf.h | 8 ++++++++
 fs/xfs/xfs_log.c           | 7 ++++---
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 986061ae1b9..36d5fcd3f59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1001,12 +1001,13 @@ xfs_buf_iodone_work(
 	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
 	 * ordered flag and reissue them.  Because we can't tell the higher
 	 * layers directly that they should not issue ordered I/O anymore, they
-	 * need to check if the ordered flag was cleared during I/O completion.
+	 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
 	 */
 	if ((bp->b_error == EOPNOTSUPP) &&
 	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
 		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
 		bp->b_flags &= ~XBF_ORDERED;
+		bp->b_flags |= _XFS_BARRIER_FAILED;
 		xfs_buf_iorequest(bp);
 	} else if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fe010995665..456519a088c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
 	 * modifications being lost.
 	 */
 	_XBF_PAGE_LOCKED = (1 << 22),
+
+	/*
+	 * If we try a barrier write, but it fails we have to communicate
+	 * this to the upper layers.  Unfortunately b_error gets overwritten
+	 * when the buffer is re-issued so we have to add another flag to
+	 * keep this information.
+	 */
+	_XFS_BARRIER_FAILED = (1 << 23),
 } xfs_buf_flags_t;
 
 typedef enum {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 503ea89e8b9..0b02c644355 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1033,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
 	l = iclog->ic_log;
 
 	/*
-	 * If the ordered flag has been removed by a lower
-	 * layer, it means the underlyin device no longer supports
+	 * If the _XFS_BARRIER_FAILED flag was set by a lower
+	 * layer, it means the underlying device no longer supports
 	 * barrier I/O. Warn loudly and turn off barriers.
 	 */
-	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) {
+	if (bp->b_flags & _XFS_BARRIER_FAILED) {
+		bp->b_flags &= ~_XFS_BARRIER_FAILED;
 		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		xfs_fs_cmn_err(CE_WARN, l->l_mp,
 				"xlog_iodone: Barriers are no longer supported"
-- 
cgit v1.2.3


From 03010a3350301baac2154fa66de925ae2981b7e3 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 10 Oct 2008 20:02:48 -0400
Subject: ext4: Rename ext4dev to ext4

The ext4 filesystem is getting stable enough that it's time to drop
the "dev" prefix.  Also remove the requirement for the TEST_FILESYS
flag.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/Kconfig        | 88 ++++++++++++++++++++++++++++++++-----------------------
 fs/Makefile       |  2 +-
 fs/ext4/Makefile  | 10 +++----
 fs/ext4/acl.h     |  6 ++--
 fs/ext4/ext4_i.h  |  4 +--
 fs/ext4/file.c    |  2 +-
 fs/ext4/inode.c   |  2 +-
 fs/ext4/namei.c   |  6 ++--
 fs/ext4/super.c   | 63 ++++++++++++++++++++++++++-------------
 fs/ext4/symlink.c |  4 +--
 fs/ext4/xattr.c   |  8 ++---
 fs/ext4/xattr.h   |  8 ++---
 12 files changed, 119 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5dab9a..40183d94b68 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4DEV_FS
-	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
 	help
-	  Ext4dev is a predecessor filesystem of the next generation
-	  extended fs ext4, based on ext3 filesystem code. It will be
-	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+	  This is the next generation of the ext3 filesystem.
 
 	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4dev is not the same as ext3 any more:
-	  it is based on extent maps and it supports 48-bit physical block
-	  numbers. These combined on-disk format changes will allow
-	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
-	  a hard limit that ext3 cannot overcome without changing the
-	  on-disk format.
-
-	  Other than extent maps and 48-bit block numbers, ext4dev also is
-	  likely to have other new features such as persistent preallocation,
-	  high resolution time stamps, and larger file support etc.  These
-	  features will be added to ext4dev gradually.
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers.  The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time.  For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called ext4dev.
 
 	  If unsure, say N.
 
-config EXT4DEV_FS_XATTR
-	bool "Ext4dev extended attributes"
-	depends on EXT4DEV_FS
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4.  Unfortunately there are some
+	  lagecy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.  
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here.   This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
 	default y
 	help
 	  Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
 
 	  If unsure, say N.
 
-	  You need this for POSIX ACL support on ext4dev/ext4.
+	  You need this for POSIX ACL support on ext4.
 
-config EXT4DEV_FS_POSIX_ACL
-	bool "Ext4dev POSIX Access Control Lists"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N
 
-config EXT4DEV_FS_SECURITY
-	bool "Ext4dev Security Labels"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux.  This option
 	  enables an extended attribute handler for file security
-	  labels in the ext4dev/ext4 filesystem.
+	  labels in the ext4 filesystem.
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
@@ -240,22 +254,22 @@ config JBD2
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers.  It is currently used by
-	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  the ext4 filesystem, but it could also be used to add
 	  journal support to other file systems or block devices such
 	  as RAID or LVM.
 
-	  If you are using ext4dev/ext4, you need to say Y here. If you are not
-	  using ext4dev/ext4 then you will probably want to say N.
+	  If you are using ext4, you need to say Y here. If you are not
+	  using ext4 then you will probably want to say N.
 
 	  To compile this device as a module, choose M here. The module will be
-	  called jbd2.  If you are compiling ext4dev/ext4 into the kernel,
+	  called jbd2.  If you are compiling ext4 into the kernel,
 	  you cannot compile this code as a module.
 
 config JBD2_DEBUG
-	bool "JBD2 (ext4dev/ext4) debugging support"
+	bool "JBD2 (ext4) debugging support"
 	depends on JBD2 && DEBUG_FS
 	help
-	  If you are using the ext4dev/ext4 journaled file system (or
+	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
 	  allows you to enable debugging output while the system is running,
 	  in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff1..de404b00eb0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,7 +69,7 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2..a8ff003a00f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux ext4-filesystem routines.
 #
 
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		   ext4_jbd2.o migrate.o mballoc.o
 
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY)	+= xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 4c9948f69f8..cb45257a246 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,7 +51,7 @@ static inline int ext4_acl_count(size_t size)
 	}
 }
 
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
    if the ACL has not been cached */
@@ -62,7 +62,7 @@ extern int ext4_permission(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
-#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_permission NULL
 
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
-#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 2875eeca172..5c124c0ac6d 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -66,7 +66,7 @@ struct ext4_inode_info {
 	__u32	i_state;		/* Dynamic state flags for ext4 */
 
 	ext4_lblk_t		i_dir_start_lookup;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
 	 * data. Taking i_mutex even when reading would cause contention
@@ -76,7 +76,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
 #endif
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6d5be156202..6bd11fba71f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -165,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a4747867411..9b4ec9decfd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4041,7 +4041,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5295a9225cf..92db9e94514 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1773,7 +1773,7 @@ retry:
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		inode->i_op = &ext4_special_inode_operations;
 #endif
 		err = ext4_add_nondir(handle, dentry, inode);
@@ -2456,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mknod		= ext4_mknod,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -2467,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 
 const struct inode_operations ext4_special_inode_operations = {
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7d865608e81..0e661c56966 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -569,7 +569,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
@@ -605,7 +605,7 @@ static void init_once(void *foo)
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
@@ -631,7 +631,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
 		posix_acl_release(EXT4_I(inode)->i_acl);
@@ -720,7 +720,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (test_opt(sb, XATTR_USER) &&
 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -729,7 +729,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -1078,7 +1078,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_orlov:
 			clear_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
 			set_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
@@ -1092,7 +1092,7 @@ static int parse_options(char *options, struct super_block *sb,
 			       "not supported\n");
 			break;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
 			set_opt(sbi->s_mount_opt, POSIX_ACL);
 			break;
@@ -1987,11 +1987,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, GRPID);
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
 		set_opt(sbi->s_mount_opt, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
@@ -2049,16 +2049,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       "EXT4-fs warning: feature flags set on rev 0 fs, "
 		       "running e2fsck is recommended\n");
 
-	/*
-	 * Since ext4 is still considered development code, we require
-	 * that the TEST_FILESYS flag in s->flags be set.
-	 */
-	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
-		printk(KERN_WARNING "EXT4-fs: %s: not marked "
-		       "OK to use with test code.\n", sb->s_id);
-		goto failed_mount;
-	}
-
 	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
@@ -3580,13 +3570,34 @@ const struct file_operations ext4_ui_proc_fops = {
 };
 #endif
 
+static struct file_system_type ext4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+#ifdef CONFIG_EXT4DEV_COMPAT
+static int ext4dev_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
+	       "to mount using ext4\n");
+	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
+	       "will go away by 2.6.31\n");
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
 static struct file_system_type ext4dev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4dev",
-	.get_sb		= ext4_get_sb,
+	.get_sb		= ext4dev_get_sb,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS("ext4dev");
+#endif
 
 static int __init init_ext4_fs(void)
 {
@@ -3603,9 +3614,16 @@ static int __init init_ext4_fs(void)
 	err = init_inodecache();
 	if (err)
 		goto out1;
-	err = register_filesystem(&ext4dev_fs_type);
+	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
+#ifdef CONFIG_EXT4DEV_COMPAT
+	err = register_filesystem(&ext4dev_fs_type);
+	if (err) {
+		unregister_filesystem(&ext4_fs_type);
+		goto out;
+	}
+#endif
 	return 0;
 out:
 	destroy_inodecache();
@@ -3618,7 +3636,10 @@ out2:
 
 static void __exit exit_ext4_fs(void)
 {
+	unregister_filesystem(&ext4_fs_type);
+#ifdef CONFIG_EXT4DEV_COMPAT
 	unregister_filesystem(&ext4dev_fs_type);
+#endif
 	destroy_inodecache();
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 0013d52f73b..00740cb32be 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 362b0edd3db..80626d516fe 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
 
 static struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
 	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
 #endif
 };
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
 struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
 	&ext4_xattr_acl_default_handler,
 #endif
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
 #endif
 	NULL
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 814ea58d4d5..8ede88b18c2 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-# ifdef CONFIG_EXT4DEV_FS_XATTR
+# ifdef CONFIG_EXT4_FS_XATTR
 
 extern struct xattr_handler ext4_xattr_user_handler;
 extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
 
 extern struct xattr_handler *ext4_xattr_handlers[];
 
-# else  /* CONFIG_EXT4DEV_FS_XATTR */
+# else  /* CONFIG_EXT4_FS_XATTR */
 
 static inline int
 ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 
 #define ext4_xattr_handlers	NULL
 
-# endif  /* CONFIG_EXT4DEV_FS_XATTR */
+# endif  /* CONFIG_EXT4_FS_XATTR */
 
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 				struct inode *dir);
 #else
-- 
cgit v1.2.3


From 77e841de8abac4755cc83ca224fdf71418d65380 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sun, 12 Oct 2008 16:39:16 -0400
Subject: jbd2: abort when failed to log metadata buffers

If we failed to write metadata buffers to the journal space and
succeeded to write the commit record, stale data can be written
back to the filesystem as metadata in the recovery phase.

To avoid this, when we failed to write out metadata buffers,
abort the journal before writing the commit record.

We can also avoid this kind of corruption by using the journal
checksum feature because it can detect invalid metadata blocks in the
journal and avoid them from being replayed.  So we don't need to care
about asynchronous commit record writeout with a checksum.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0d3814a35ed..78e4da93412 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -783,6 +783,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		jbd2_journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-- 
cgit v1.2.3


From 44519faf22ad6ce924ad0352d3dc200d9e0b66e8 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 20:29:13 -0400
Subject: jbd2: fix error handling for checkpoint io

When a checkpointing IO fails, current JBD2 code doesn't check the
error and continue journaling.  This means latest metadata can be
lost from both the journal and filesystem.

This patch leaves the failed metadata blocks in the journal space
and aborts journaling in the case of jbd2_log_do_checkpoint().
To achieve this, we need to do:

1. don't remove the failed buffer from the checkpoint list where in
   the case of __try_to_free_cp_buf() because it may be released or
   overwritten by a later transaction
2. jbd2_log_do_checkpoint() is the last chance, remove the failed
   buffer from the checkpoint list and abort the journal
3. when checkpointing fails, don't update the journal super block to
   prevent the journaled contents from being cleaned.  For safety,
   don't update j_tail and j_tail_sequence either
4. when checkpointing fails, notify this error to the ext4 layer so
   that ext4 don't clear the needs_recovery flag, otherwise the
   journaled contents are ignored and cleaned in the recovery phase
5. if the recovery fails, keep the needs_recovery flag
6. prevent jbd2_cleanup_journal_tail() from being called between
   __jbd2_journal_drop_transaction() and jbd2_journal_abort()
   (a possible race issue between jbd2_log_do_checkpoint()s called by
   jbd2_journal_flush() and __jbd2_log_wait_for_space())

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 49 +++++++++++++++++++++++++++++++++++++------------
 fs/jbd2/journal.c    | 28 ++++++++++++++++++++++------
 fs/jbd2/recovery.c   |  7 +++++--
 3 files changed, 64 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 42895d36945..9203c3332f1 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -94,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -176,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -210,6 +215,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -219,6 +227,8 @@ restart:
 		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -242,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -274,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd2_log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__jbd2_journal_remove_checkpoint(jh);
@@ -281,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -314,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int jbd2_log_do_checkpoint(journal_t *journal)
 {
@@ -339,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -357,7 +372,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -371,6 +386,8 @@ restart:
 			}
 			retry = __process_buffer(journal, jh, bhs, &batch_count,
 						 transaction);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -395,14 +412,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = jbd2_cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		jbd2_journal_abort(journal, result);
+	else
+		result = jbd2_cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -418,8 +439,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -428,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 01c3901c3a0..783de118de9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1451,9 +1451,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void jbd2_journal_destroy(journal_t *journal)
+int jbd2_journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1476,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		jbd2_journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			jbd2_journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1492,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
 		jbd2_journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1717,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	jbd2_cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1742,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b7..73063285b13 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do {									\
  */
 int jbd2_journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	jbd2_journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 7ffe1ea8949c75ecffb7a4d988bb881a9fa62fbe Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 20:29:21 -0400
Subject: ext4: add checks for errors from jbd2

If the journal has aborted due to a checkpointing failure, we
have to keep the contents of the journal space.  Otherwise, the
filesystem will lose uncheckpointed metadata completely and
become inconsistent.  To avoid this, we need to keep needs_recovery
flag if checkpoint has failed.

With this patch, ext4_put_super() detects a checkpointing failure
from the return value of journal_destroy(), then it invokes
ext4_abort() to make the filesystem read only and keep
needs_recovery flag.  Errors from jbd2_journal_flush() are also
handled by this patch in some places.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 12 ++++++++----
 fs/ext4/super.c | 23 +++++++++++++++++++----
 2 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ea27eaa0cfe..dc99b4776d5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -192,7 +192,7 @@ setversion_out:
 	case EXT4_IOC_GROUP_EXTEND: {
 		ext4_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -206,8 +206,10 @@ setversion_out:
 
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
@@ -215,7 +217,7 @@ setversion_out:
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -230,8 +232,10 @@ setversion_out:
 
 		err = ext4_group_add(sb, &input);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0e661c56966..79bd3989e84 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -507,7 +507,8 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	jbd2_journal_destroy(sbi->s_journal);
+	if (jbd2_journal_destroy(sbi->s_journal) < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
 	sbi->s_journal = NULL;
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2853,7 +2854,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
 	jbd2_journal_lock_updates(journal);
-	jbd2_journal_flush(journal);
+	if (jbd2_journal_flush(journal) < 0)
+		goto out;
+
 	lock_super(sb);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
@@ -2862,6 +2865,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 		ext4_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
+
+out:
 	jbd2_journal_unlock_updates(journal);
 }
 
@@ -2962,7 +2967,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 
 		/* Now we set up the journal barrier. */
 		jbd2_journal_lock_updates(journal);
-		jbd2_journal_flush(journal);
+
+		/*
+		 * We don't want to clear needs_recovery flag when we failed
+		 * to flush the journal.
+		 */
+		if (jbd2_journal_flush(journal) < 0)
+			return;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -3402,8 +3413,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		 * otherwise be livelocked...
 		 */
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err) {
+			path_put(&nd.path);
+			return err;
+		}
 	}
 
 	err = vfs_quota_on_path(sb, type, format_id, &nd.path);
-- 
cgit v1.2.3


From 7ad7445f60fe4d46c4c9d2a9463db180d2a3b270 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 20:29:31 -0400
Subject: jbd2: don't dirty original metadata buffer on abort

Currently, original metadata buffers are dirtied when they are
unfiled whether the journal has aborted or not.  Eventually these
buffers will be written-back to the filesystem by pdflush.  This
means some metadata buffers are written to the filesystem without
journaling if the journal aborts.  So if both journal abort and
system crash happen at the same time, the filesystem would become
inconsistent state.  Additionally, replaying journaled metadata
can overwrite the latest metadata on the filesystem partly.
Because, if the journal gets aborted, journaled metadata are
preserved and replayed during the next mount not to lose
uncheckpointed metadata.  This would also break the consistency
of the filesystem.

This patch prevents original metadata buffers from being dirtied
on abort by clearing BH_JBDDirty flag from those buffers.  Thus,
no metadata buffers are written to the filesystem without journaling.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/commit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 78e4da93412..849f10496ce 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -504,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -884,6 +885,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__jbd2_journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
-- 
cgit v1.2.3


From 5bf5683a33f3584da6eced480967c4f7e11515a8 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Fri, 10 Oct 2008 22:12:43 -0400
Subject: ext4: add an option to control error handling on file data

If the journal doesn't abort when it gets an IO error in file data
blocks, the file data corruption will spread silently.  Because
most of applications and commands do buffered writes without fsync(),
they don't notice the IO error.  It's scary for mission critical
systems.  On the other hand, if the journal aborts whenever it gets
an IO error in file data blocks, the system will easily become
inoperable.  So this patch introduces a filesystem option to
determine whether it aborts the journal or just call printk() when
it gets an IO error in file data.

If you mount an ext4 fs with data_err=abort option, it aborts on file
data write error.  If you mount it with data_err=ignore, it doesn't
abort, just call printk().  data_err=ignore is the default.

Here is the corresponding patch of the ext3 version:
http://kerneltrap.org/mailarchive/linux-kernel/2008/9/9/3239374

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h   |  2 ++
 fs/ext4/super.c  | 16 ++++++++++++++++
 fs/jbd2/commit.c |  2 ++
 3 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f46a513a515..6690a41cdd9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -540,6 +540,8 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 79bd3989e84..014677b8e22 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -778,6 +778,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",inode_readahead_blks=%u",
 			   sbi->s_inode_readahead_blks);
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -907,6 +910,7 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -953,6 +957,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1187,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -2535,6 +2547,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 849f10496ce..0abe02c4242 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -684,6 +684,8 @@ start_journal_io:
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors while flushing file data "
 		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
 		err = 0;
 	}
 
-- 
cgit v1.2.3


From 14835a3325c1f84c3ae6eaf81102a3917e84809e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 12 Oct 2008 13:34:11 +0000
Subject: [CIFS] cifs: remove pointless lock and unlock of GlobalMid_Lock in
 header_assemble

We lock GlobalMid_Lock in header_assemble and then immediately unlock it
again without doing anything. Not sure what this was intended to do, but
remove it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 654d972a88f..88786ba02d2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -311,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
 	buffer->Pid = cpu_to_le16((__u16)current->tgid);
 	buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
-	spin_lock(&GlobalMid_Lock);
-	spin_unlock(&GlobalMid_Lock);
 	if (treeCon) {
 		buffer->Tid = treeCon->tid;
 		if (treeCon->ses) {
-- 
cgit v1.2.3


From 06270d5d6aefb46b88bf44a7c5b1b9b3ef352c48 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 12 Oct 2008 07:15:19 +0300
Subject: provide generic_block_fiemap() only with BLOCK=y

This fixes the following compile error with CONFIG_BLOCK=n caused by
commit 68c9d702bb72f367f3b148963ec6cf5e07ff7f65 ("generic block based
fiemap implementation"):

    CC      fs/ioctl.o
  fs/ioctl.c: In function 'generic_block_fiemap':
  fs/ioctl.c:249: error: storage size of 'tmp' isn't known
  fs/ioctl.c:272: error: invalid application of 'sizeof' to incomplete type 'struct buffer_head'
  fs/ioctl.c:280: error: implicit declaration of function 'buffer_mapped'
  fs/ioctl.c:249: warning: unused variable 'tmp'
  make[2]: *** [fs/ioctl.o] Error 1

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ioctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 33a6b7ecb8b..d152856c371 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -226,6 +226,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	return error;
 }
 
+#ifdef CONFIG_BLOCK
+
 #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
 #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
 
@@ -342,6 +344,8 @@ int generic_block_fiemap(struct inode *inode,
 }
 EXPORT_SYMBOL(generic_block_fiemap);
 
+#endif  /*  CONFIG_BLOCK  */
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
-- 
cgit v1.2.3


From 3244fcb1ae03362e4aa8cb1a9039fbfd61661859 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Sun, 12 Oct 2008 17:27:49 -0400
Subject: ext4: fix build failure without procfs

fs/ext4/super.c: In function 'ext4_fill_super':
fs/ext4/super.c:2226: error: 'ext4_ui_proc_fops' undeclared (first use
in this function)
fs/ext4/super.c:2226: error: (Each undeclared identifier is reported
only once
fs/ext4/super.c:2226: error: for each function it appears in.)

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 014677b8e22..fb940c22ab0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2231,6 +2231,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+#ifdef CONFIG_PROC_FS
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 
@@ -2238,6 +2239,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
 				 &ext4_ui_proc_fops,
 				 &sbi->s_inode_readahead_blks);
+#endif
 
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
-- 
cgit v1.2.3


From f319fb8bf6899e08bdb8d1e09a4e7a129dfa2312 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Sun, 12 Oct 2008 15:53:01 -0400
Subject: ext4: fix kconfig typo and extra whitespace

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 40183d94b68..f54a157a029 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -170,8 +170,8 @@ config EXT4DEV_COMPAT
 	help
 	  Starting with 2.6.28, the name of the ext4 filesystem was
 	  renamed from ext4dev to ext4.  Unfortunately there are some
-	  lagecy userspace programs (such as klibc's fstype) have
-	  "ext4dev" hardcoded.  
+	  legacy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.
 
 	  To enable backwards compatibility so that systems that are
 	  still expecting to mount ext4 filesystems using ext4dev,
-- 
cgit v1.2.3