aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c18
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/callback.c2
-rw-r--r--fs/afs/inode.c2
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/aio.c4
-rw-r--r--fs/binfmt_elf.c3
-rw-r--r--fs/binfmt_elf_fdpic.c2
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/bio-integrity.c719
-rw-r--r--fs/bio.c88
-rw-r--r--fs/block_dev.c16
-rw-r--r--fs/buffer.c32
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/CHANGES5
-rw-r--r--fs/cifs/asn1.c14
-rw-r--r--fs/cifs/cifsacl.c10
-rw-r--r--fs/cifs/cifsfs.c23
-rw-r--r--fs/cifs/cifsglob.h3
-rw-r--r--fs/cifs/cifspdu.h23
-rw-r--r--fs/cifs/cifssmb.c12
-rw-r--r--fs/cifs/connect.c5
-rw-r--r--fs/cifs/dir.c4
-rw-r--r--fs/cifs/file.c7
-rw-r--r--fs/cifs/inode.c168
-rw-r--r--fs/cifs/misc.c3
-rw-r--r--fs/cifs/readdir.c77
-rw-r--r--fs/dcache.c68
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c3
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/ecryptfs/read_write.c22
-rw-r--r--fs/exec.c11
-rw-r--r--fs/ext3/resize.c3
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/balloc.c278
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c459
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c58
-rw-r--r--fs/ext4/super.c180
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/file.c50
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fuse/inode.c11
-rw-r--r--fs/gfs2/bmap.c23
-rw-r--r--fs/gfs2/ops_file.c4
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c295
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/recovery.c12
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/libfs.c46
-rw-r--r--fs/locks.c6
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c35
-rw-r--r--fs/namei.c26
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/file.c6
-rw-r--r--fs/nfs/mount_clnt.c5
-rw-r--r--fs/nfs/super.c76
-rw-r--r--fs/nfs/write.c7
-rw-r--r--fs/ntfs/upcase.c5
-rw-r--r--fs/ocfs2/cluster/nodemanager.c74
-rw-r--r--fs/ocfs2/cluster/nodemanager.h4
-rw-r--r--fs/ocfs2/cluster/tcp.c28
-rw-r--r--fs/ocfs2/cluster/tcp.h12
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h32
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h12
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/stack_o2cb.c41
-rw-r--r--fs/ocfs2/stack_user.c40
-rw-r--r--fs/ocfs2/stackglue.c119
-rw-r--r--fs/ocfs2/stackglue.h19
-rw-r--r--fs/open.c37
-rw-r--r--fs/pipe.c10
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c42
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/proc_misc.c24
-rw-r--r--fs/proc/task_mmu.c181
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/select.c2
-rw-r--r--fs/smbfs/file.c11
-rw-r--r--fs/splice.c34
-rw-r--r--fs/udf/super.c57
-rw-r--r--fs/udf/udfdecl.h2
-rw-r--r--fs/utimes.c59
-rw-r--r--fs/vfat/namei.c35
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h19
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h8
-rw-r--r--fs/xfs/xfs_inode.c9
-rw-r--r--fs/xfs/xfs_log.c15
-rw-r--r--fs/xfs/xfs_vnodeops.c112
-rw-r--r--fs/xfs/xfs_vnodeops.h3
128 files changed, 4688 insertions, 1950 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd01d90cada..57997fa14e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
void v9fs_dentry_release(struct dentry *);
-int v9fs_uflags2omode(int uflags);
+int v9fs_uflags2omode(int uflags, int extended);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 0d55affe37d..52944d2249a 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags);
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
inode->i_size = 0;
inode->i_blocks = 0;
}
+ if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ generic_file_llseek(file, 0, SEEK_END);
}
file->private_data = fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 40fa807bd92..c95295c6504 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
/**
* v9fs_uflags2omode- convert posix open flags to plan 9 mode bits
* @uflags: flags to convert
- *
+ * @extended: if .u extensions are active
*/
-int v9fs_uflags2omode(int uflags)
+int v9fs_uflags2omode(int uflags, int extended)
{
int ret;
@@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags)
break;
}
- if (uflags & O_EXCL)
- ret |= P9_OEXCL;
-
if (uflags & O_TRUNC)
ret |= P9_OTRUNC;
- if (uflags & O_APPEND)
- ret |= P9_OAPPEND;
+ if (extended) {
+ if (uflags & O_EXCL)
+ ret |= P9_OEXCL;
+
+ if (uflags & O_APPEND)
+ ret |= P9_OAPPEND;
+ }
return ret;
}
@@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags));
+ v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
diff --git a/fs/Kconfig b/fs/Kconfig
index cf12c403b8c..313b2e06ded 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -830,7 +830,7 @@ config NTFS_FS
from the project web site.
For more information see <file:Documentation/filesystems/ntfs.txt>
- and <http://linux-ntfs.sourceforge.net/>.
+ and <http://www.linux-ntfs.org/>.
To compile this file system support as a module, choose M here: the
module will be called ntfs.
@@ -930,7 +930,7 @@ config PROC_KCORE
config PROC_VMCORE
bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+ depends on PROC_FS && CRASH_DUMP
default y
help
Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 55e8ee1900a..3263084eef9 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
config BINFMT_FLAT
bool "Kernel support for flat binaries"
- depends on !MMU
+ depends on !MMU && (!FRV || BROKEN)
help
Support uClinux FLAT format binaries.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da..277b079dec9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index a78d5b236bb..587ef5123cd 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -8,7 +8,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
+ * Authors: David Woodhouse <dwmw2@infradead.org>
* David Howells <dhowells@redhat.com>
*
*/
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 08db82e1343..bb47217f6a1 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -8,7 +8,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
+ * Authors: David Woodhouse <dwmw2@infradead.org>
* David Howells <dhowells@redhat.com>
*
*/
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 4b572b801d8..7e3faeef681 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -10,7 +10,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Authors: David Howells <dhowells@redhat.com>
- * David Woodhouse <dwmw2@redhat.com>
+ * David Woodhouse <dwmw2@infradead.org>
*
*/
diff --git a/fs/aio.c b/fs/aio.c
index b5253e77eb2..0fb3117ddd9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -591,10 +591,6 @@ static void use_mm(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
tsk->mm = mm;
tsk->active_mm = mm;
- /*
- * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
- * it won't work. Update it accordingly if you change it here
- */
switch_mm(active_mm, mm, tsk);
task_unlock(tsk);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0fa95b198e6..d48ff5f370f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -16,7 +16,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/a.out.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/binfmts.h>
@@ -548,7 +547,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
- struct exec interp_ex;
} *loc;
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
@@ -680,7 +678,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
}
/* Get the exec headers */
- loc->interp_ex = *((struct exec *)bprm->buf);
loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index ddd35d87339..d051a32e627 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -390,7 +390,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
}
/* expand the stack mapping to use up the entire allocation granule */
- fullsize = ksize((char *) current->mm->start_brk);
+ fullsize = kobjsize((char *) current->mm->start_brk);
if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
fullsize, 0, 0)))
stack_size = fullsize;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 3b40d45a3a1..2cb1acda3a8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -548,7 +548,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
/* Remap to use all availabe slack region space */
if (realdatastart && (realdatastart < (unsigned long)-4096)) {
- reallen = ksize((void *)realdatastart);
+ reallen = kobjsize((void *)realdatastart);
if (reallen > len) {
realdatastart = do_mremap(realdatastart, len,
reallen, MREMAP_FIXED, realdatastart);
@@ -600,7 +600,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
/* Remap to use all availabe slack region space */
if (textpos && (textpos < (unsigned long) -4096)) {
- reallen = ksize((void *)textpos);
+ reallen = kobjsize((void *)textpos);
if (reallen > len) {
textpos = do_mremap(textpos, len, reallen,
MREMAP_FIXED, textpos);
@@ -683,7 +683,7 @@ static int load_flat_file(struct linux_binprm * bprm,
*/
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
- current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
+ current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
}
if (flags & FLAT_FLAG_KTRACE)
@@ -790,7 +790,7 @@ static int load_flat_file(struct linux_binprm * bprm,
/* zero the BSS, BRK and stack areas */
memset((void*)(datapos + data_len), 0, bss_len +
- (memp + ksize((void *) memp) - stack_len - /* end brk */
+ (memp + kobjsize((void *) memp) - stack_len - /* end brk */
libinfo->lib_list[id].start_brk) + /* start brk */
stack_len);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 00000000000..63e2ee63058
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ * @bs: bio_set to allocate from
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip;
+ struct bio_vec *iv;
+ unsigned long idx;
+
+ BUG_ON(bio == NULL);
+
+ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+ return NULL;
+ }
+
+ memset(bip, 0, sizeof(*bip));
+
+ iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+ if (unlikely(iv == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+ mempool_free(bip, bs->bio_integrity_pool);
+ return NULL;
+ }
+
+ bip->bip_pool = idx;
+ bip->bip_vec = iv;
+ bip->bip_bio = bio;
+ bio->bi_integrity = bip;
+
+ return bip;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
+{
+ return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio: bio containing bip to be freed
+ * @bs: bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip == NULL);
+
+ /* A cloned bio doesn't own the integrity metadata */
+ if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+ kfree(bip->bip_buf);
+
+ mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+ mempool_free(bip, bs->bio_integrity_pool);
+
+ bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio: bio to update
+ * @page: page containing integrity metadata
+ * @len: number of bytes of integrity metadata in page
+ * @offset: start offset within page
+ *
+ * Description: Attach a page containing integrity metadata to bio.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_vec *iv;
+
+ if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+ printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ return 0;
+ }
+
+ iv = bip_vec_idx(bip, bip->bip_vcnt);
+ BUG_ON(iv == NULL);
+ BUG_ON(iv->bv_page != NULL);
+
+ iv->bv_page = page;
+ iv->bv_len = len;
+ iv->bv_offset = offset;
+ bip->bip_vcnt++;
+
+ return len;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio: bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not. bio data direction and target device must be
+ * set prior to calling. The functions honors the write_generate and
+ * read_verify flags in sysfs.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return 0;
+
+ return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi: blk_integrity profile for device
+ * @sectors: Number of 512 sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device. Convert the block layer sectors
+ * to physical sectors.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ /* At this point there are only 512b or 4096b DIF/EPP devices */
+ if (bi->sector_size == 4096)
+ return sectors >>= 3;
+
+ return sectors;
+}
+
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio: bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+
+ BUG_ON(bio->bi_size == 0);
+
+ return bi->tag_size * (bio->bi_size / bi->sector_size);
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip->bip_buf == NULL);
+
+ if (bi->tag_size == 0)
+ return -1;
+
+ nr_sectors = bio_integrity_hw_sectors(bi,
+ DIV_ROUND_UP(len, bi->tag_size));
+
+ if (nr_sectors * bi->tuple_size > bip->bip_size) {
+ printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+ __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+ return -1;
+ }
+
+ if (set)
+ bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+ else
+ bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+
+ return 0;
+}
+
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio: bio to attach buffer to
+ * @tag_buf: Pointer to a buffer containing tag data
+ * @len: Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection. The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != WRITE);
+
+ return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio: bio to retrieve buffer from
+ * @tag_buf: Pointer to a buffer for the tag data
+ * @len: Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != READ);
+
+ return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio: bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function. The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_sector;
+ unsigned int i, sectors, total;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ bi->generate_fn(&bix);
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio. The bio must have data
+ * direction, target device and start sector set priot to calling. In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function. In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ struct blk_integrity *bi;
+ struct request_queue *q;
+ void *buf;
+ unsigned long start, end;
+ unsigned int len, nr_pages;
+ unsigned int bytes, offset, i;
+ unsigned int sectors;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ q = bdev_get_queue(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bio_integrity(bio));
+
+ sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+ /* Allocate kernel buffer for protection data */
+ len = sectors * blk_integrity_tuple_size(bi);
+ buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+ if (unlikely(buf == NULL)) {
+ printk(KERN_ERR "could not allocate integrity buffer\n");
+ return -EIO;
+ }
+
+ end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ((unsigned long) buf) >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ /* Allocate bio integrity payload and integrity vectors */
+ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "could not allocate data integrity bioset\n");
+ kfree(buf);
+ return -EIO;
+ }
+
+ bip->bip_buf = buf;
+ bip->bip_size = len;
+ bip->bip_sector = bio->bi_sector;
+
+ /* Map it */
+ offset = offset_in_page(buf);
+ for (i = 0 ; i < nr_pages ; i++) {
+ int ret;
+ bytes = PAGE_SIZE - offset;
+
+ if (len <= 0)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset);
+
+ if (ret == 0)
+ return 0;
+
+ if (ret < bytes)
+ break;
+
+ buf += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+
+ /* Install custom I/O completion handler if read verify is enabled */
+ if (bio_data_dir(bio) == READ) {
+ bip->bip_end_io = bio->bi_end_io;
+ bio->bi_end_io = bio_integrity_endio;
+ }
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE)
+ bio_integrity_generate(bio);
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio: bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio. The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_integrity->bip_sector;
+ unsigned int i, sectors, total, ret;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ ret = total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ ret = bi->verify_fn(&bix);
+
+ if (ret) {
+ kunmap_atomic(kaddr, KM_USER0);
+ break;
+ }
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ return ret;
+}
+
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work: Work struct stored in bio to be verified
+ *
+ * Description: This workqueue function is called to complete a READ
+ * request. The function verifies the transferred integrity metadata
+ * and then calls the original bio end_io function.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_payload *bip =
+ container_of(work, struct bio_integrity_payload, bip_work);
+ struct bio *bio = bip->bip_bio;
+ int error = bip->bip_error;
+
+ if (bio_integrity_verify(bio)) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ error = -EIO;
+ }
+
+ /* Restore original bio completion handler */
+ bio->bi_end_io = bip->bip_end_io;
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio, error);
+}
+
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ * @error: Pointer to errno
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context. However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context. This function postpones completion
+ * accordingly.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip->bip_bio != bio);
+
+ bip->bip_error = error;
+ INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip: Integrity vector to advance
+ * @skip: Number of bytes to advance it
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+ unsigned int skip)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (skip == 0) {
+ bip->bip_idx = i;
+ return;
+ } else if (skip >= iv->bv_len) {
+ skip -= iv->bv_len;
+ } else { /* skip < iv->bv_len) */
+ iv->bv_offset += skip;
+ iv->bv_len -= skip;
+ bip->bip_idx = i;
+ return;
+ }
+ }
+}
+
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip: Integrity vector to truncate
+ * @len: New length of integrity vector
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+ unsigned int len)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (len == 0) {
+ bip->bip_vcnt = i;
+ return;
+ } else if (len >= iv->bv_len) {
+ len -= iv->bv_len;
+ } else { /* len < iv->bv_len) */
+ iv->bv_len = len;
+ len = 0;
+ }
+ }
+}
+
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio: bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: This function calculates how many integrity bytes the
+ * number of completed data bytes correspond to and advances the
+ * integrity vector accordingly.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
+ bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio: bio whose integrity vector to update
+ * @offset: offset to first data sector
+ * @sectors: number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'len' data
+ * sectors.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+ unsigned int sectors)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+ BUG_ON(!bio_flagged(bio, BIO_CLONED));
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+ bip->bip_sector = bip->bip_sector + offset;
+ bio_integrity_mark_head(bip, offset * bi->tuple_size);
+ bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio: Protected bio
+ * @bp: Resulting bio_pair
+ * @sectors: Offset
+ *
+ * Description: Splits an integrity page into a bio_pair.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+ struct blk_integrity *bi;
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ unsigned int nr_sectors;
+
+ if (bio_integrity(bio) == 0)
+ return;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bip->bip_vcnt != 1);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+
+ bp->bio1.bi_integrity = &bp->bip1;
+ bp->bio2.bi_integrity = &bp->bip2;
+
+ bp->iv1 = bip->bip_vec[0];
+ bp->iv2 = bip->bip_vec[0];
+
+ bp->bip1.bip_vec = &bp->iv1;
+ bp->bip2.bip_vec = &bp->iv2;
+
+ bp->iv1.bv_len = sectors * bi->tuple_size;
+ bp->iv2.bv_offset += sectors * bi->tuple_size;
+ bp->iv2.bv_len -= sectors * bi->tuple_size;
+
+ bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+ bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+
+ bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+ bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio: New bio
+ * @bio_src: Original bio
+ * @bs: bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
+ struct bio_integrity_payload *bip;
+
+ BUG_ON(bip_src == NULL);
+
+ bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+
+ if (bip == NULL)
+ return -EIO;
+
+ memcpy(bip->bip_vec, bip_src->bip_vec,
+ bip_src->bip_vcnt * sizeof(struct bio_vec));
+
+ bip->bip_sector = bip_src->bip_sector;
+ bip->bip_vcnt = bip_src->bip_vcnt;
+ bip->bip_idx = bip_src->bip_idx;
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+ bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
+ bio_integrity_slab);
+ if (!bs->bio_integrity_pool)
+ return -1;
+
+ return 0;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+void bioset_integrity_free(struct bio_set *bs)
+{
+ if (bs->bio_integrity_pool)
+ mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+void __init bio_integrity_init_slab(void)
+{
+ bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+EXPORT_SYMBOL(bio_integrity_init_slab);
+
+static int __init integrity_init(void)
+{
+ kintegrityd_wq = create_workqueue("kintegrityd");
+
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+
+ return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb5..88322b066ac 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
-
static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
-
/*
* if you change this list, also change bvec_alloc or things will
* break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
#undef BV
/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
- mempool_t *bio_pool;
- mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-
-/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+ return bvec_slabs[idx].nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
}
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, bio_set);
+
mempool_free(bio, bio_set->bio_pool);
}
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
- if (b) {
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
+ if (!b)
+ return NULL;
+
+ b->bi_destructor = bio_fs_destructor;
+ __bio_clone(b, bio);
+
+ if (bio_integrity(bio)) {
+ int ret;
+
+ ret = bio_integrity_clone(b, bio, fs_bio_set);
+
+ if (ret < 0)
+ return NULL;
}
return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, bio, prev) < len) {
- prev->bv_len -= len;
- return 0;
+
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+ prev->bv_len -= len;
+ return 0;
+ }
}
goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* queue to get further control
*/
if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
/*
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, bio, bvec) < len) {
+ if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
bp->bio1.bi_private = bi;
bp->bio2.bi_private = pool;
+ if (bio_integrity(bi))
+ bio_integrity_split(bi, bp, first_sectors);
+
return bp;
}
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ bioset_integrity_free(bs);
biovec_free_pools(bs);
kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
if (!bs->bio_pool)
goto bad;
+ if (bioset_integrity_create(bs, bio_pool_size))
+ goto bad;
+
if (!biovec_create_pools(bs, bvec_pool_size))
return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
{
bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_integrity_init_slab();
biovec_init_slabs();
fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7d822fae776..10d8a0aa871 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -12,6 +12,7 @@
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
+#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
@@ -928,9 +929,22 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
{
struct module *owner = NULL;
struct gendisk *disk;
- int ret = -ENXIO;
+ int ret;
int part;
+ int perm = 0;
+
+ if (file->f_mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (file->f_mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ /*
+ * hooks: /n/, see "layering violations".
+ */
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret != 0)
+ return ret;
+ ret = -ENXIO;
file->f_mapping = bdev->bd_inode->i_mapping;
lock_kernel();
disk = get_gendisk(bdev->bd_dev, &part);
diff --git a/fs/buffer.c b/fs/buffer.c
index a073f3f4f01..5fa1512cd9a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* contents - it is a noop if I/O is still in
* flight on potentially older contents.
*/
- ll_rw_block(SWRITE, 1, &bh);
+ ll_rw_block(SWRITE_SYNC, 1, &bh);
brelse(bh);
spin_lock(lock);
}
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
@@ -2940,16 +2953,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
- if (rw == SWRITE)
+ if (rw == SWRITE || rw == SWRITE_SYNC)
lock_buffer(bh);
else if (test_set_buffer_locked(bh))
continue;
- if (rw == WRITE || rw == SWRITE) {
+ if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE, bh);
+ if (rw == SWRITE_SYNC)
+ submit_bh(WRITE_SYNC, bh);
+ else
+ submit_bh(WRITE, bh);
continue;
}
} else {
@@ -2978,7 +2994,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b8845..3cb7cda3d78 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
spin_lock(&cdev_lock);
+ /* Check i_cdev again in case somebody beat us to it while
+ we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
cdev_put(p);
return -ENXIO;
}
- if (filp->f_op->open) {
- lock_kernel();
+ if (filp->f_op->open)
ret = filp->f_op->open(inode,filp);
- unlock_kernel();
- }
if (ret)
cdev_put(p);
return ret;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 28e3d5c5fca..1f3465201fd 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -2,6 +2,11 @@ Version 1.53
------------
DFS support added (Microsoft Distributed File System client support needed
for referrals which enable a hierarchical name space among servers).
+Disable temporary caching of mode bits to servers which do not support
+storing of mode (e.g. Windows servers, when client mounts without cifsacl
+mount option) and add new "dynperm" mount option to enable temporary caching
+of mode (enable old behavior). Fix hang on mount caused when server crashes
+tcp session during negotiate protocol.
Version 1.52
------------
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cb52cbbe45f..f58e41d3ba4 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -186,6 +186,11 @@ asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len)
}
}
}
+
+ /* don't trust len bigger than ctx buffer */
+ if (*len > ctx->end - ctx->pointer)
+ return 0;
+
return 1;
}
@@ -203,6 +208,10 @@ asn1_header_decode(struct asn1_ctx *ctx,
if (!asn1_length_decode(ctx, &def, &len))
return 0;
+ /* primitive shall be definite, indefinite shall be constructed */
+ if (*con == ASN1_PRI && !def)
+ return 0;
+
if (def)
*eoc = ctx->pointer + len;
else
@@ -389,6 +398,11 @@ asn1_oid_decode(struct asn1_ctx *ctx,
unsigned long *optr;
size = eoc - ctx->pointer + 1;
+
+ /* first subid actually encodes first two subids */
+ if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ return 0;
+
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
if (*oid == NULL)
return 0;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff540..0e9fc2ba90e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5df93fd6303..22857c639df 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -97,9 +97,6 @@ cifs_read_super(struct super_block *sb, void *data,
{
struct inode *inode;
struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- int len;
-#endif
int rc = 0;
/* BB should we make this contingent on mount parm? */
@@ -117,15 +114,17 @@ cifs_read_super(struct super_block *sb, void *data,
* complex operation (mount), and in case of fail
* just exit instead of doing mount and attempting
* undo it if this copy fails?*/
- len = strlen(data);
- cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
- if (cifs_sb->mountdata == NULL) {
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
- return -ENOMEM;
+ if (data) {
+ int len = strlen(data);
+ cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+ if (cifs_sb->mountdata == NULL) {
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+ return -ENOMEM;
+ }
+ strncpy(cifs_sb->mountdata, data, len + 1);
+ cifs_sb->mountdata[len] = '\0';
}
- strncpy(cifs_sb->mountdata, data, len + 1);
- cifs_sb->mountdata[len] = '\0';
#endif
rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -613,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(file, offset, origin);
+ return generic_file_llseek_unlocked(file, offset, origin);
}
struct file_system_type cifs_fs_type = {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 08914053242..9cfcf326ead 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -333,7 +333,6 @@ struct cifsFileInfo {
bool messageMode:1; /* for pipes: message vs byte mode */
atomic_t wrtPending; /* handle in use - defer close */
struct semaphore fh_sem; /* prevents reopen race after dead ses*/
- char *search_resume_name; /* BB removeme BB */
struct cifs_search_info srch_inf;
};
@@ -626,7 +625,7 @@ GLOBAL_EXTERN atomic_t tcpSesAllocCount;
GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
-/* Various Debug counters to remove someday (BB) */
+/* Various Debug counters */
GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
#ifdef CONFIG_CIFS_STATS2
GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 65d58b4e6a6..0f327c224da 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -79,6 +79,19 @@
#define TRANS2_GET_DFS_REFERRAL 0x10
#define TRANS2_REPORT_DFS_INCOSISTENCY 0x11
+/* SMB Transact (Named Pipe) subcommand codes */
+#define TRANS_SET_NMPIPE_STATE 0x0001
+#define TRANS_RAW_READ_NMPIPE 0x0011
+#define TRANS_QUERY_NMPIPE_STATE 0x0021
+#define TRANS_QUERY_NMPIPE_INFO 0x0022
+#define TRANS_PEEK_NMPIPE 0x0023
+#define TRANS_TRANSACT_NMPIPE 0x0026
+#define TRANS_RAW_WRITE_NMPIPE 0x0031
+#define TRANS_READ_NMPIPE 0x0036
+#define TRANS_WRITE_NMPIPE 0x0037
+#define TRANS_WAIT_NMPIPE 0x0053
+#define TRANS_CALL_NMPIPE 0x0054
+
/* NT Transact subcommand codes */
#define NT_TRANSACT_CREATE 0x01
#define NT_TRANSACT_IOCTL 0x02
@@ -328,12 +341,13 @@
#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */
#define CREATE_NO_EA_KNOWLEDGE 0x00000200
#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete
- open for recovery flag - should
- be zero */
+ "open for recovery" flag - should
+ be zero in any case */
+#define CREATE_OPEN_FOR_RECOVERY 0x00000400
#define CREATE_RANDOM_ACCESS 0x00000800
#define CREATE_DELETE_ON_CLOSE 0x00001000
#define CREATE_OPEN_BY_ID 0x00002000
-#define CREATE_OPEN_BACKUP_INTN 0x00004000
+#define CREATE_OPEN_BACKUP_INTENT 0x00004000
#define CREATE_NO_COMPRESSION 0x00008000
#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */
#define OPEN_REPARSE_POINT 0x00200000
@@ -722,7 +736,6 @@ typedef struct smb_com_tconx_rsp_ext {
#define SMB_CSC_CACHE_AUTO_REINT 0x0004
#define SMB_CSC_CACHE_VDO 0x0008
#define SMB_CSC_NO_CACHING 0x000C
-
#define SMB_UNIQUE_FILE_NAME 0x0010
#define SMB_EXTENDED_SIGNATURES 0x0020
@@ -806,7 +819,7 @@ typedef struct smb_com_findclose_req {
#define ICOUNT_MASK 0x00FF
#define PIPE_READ_MODE 0x0100
#define NAMED_PIPE_TYPE 0x0400
-#define PIPE_END_POINT 0x0800
+#define PIPE_END_POINT 0x4000
#define BLOCKING_NAMED_PIPE 0x8000
typedef struct smb_com_open_req { /* also handles create */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9b8b4cfdf99..4511b708f0f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1728,7 +1728,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
{
int rc = 0;
LOCK_REQ *pSMB = NULL;
- LOCK_RSP *pSMBr = NULL;
+/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */
int bytes_returned;
int timeout = 0;
__u16 count;
@@ -1739,8 +1739,6 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
if (rc)
return rc;
- pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */
-
if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
timeout = CIFS_ASYNC_OP; /* no response expected */
pSMB->Timeout = 0;
@@ -1774,7 +1772,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
if (waitFlag) {
rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned);
+ (struct smb_hdr *) pSMB, &bytes_returned);
cifs_small_buf_release(pSMB);
} else {
rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
@@ -3927,9 +3925,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
}
ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
- if (ref->VersionNumber != 3) {
+ if (ref->VersionNumber != cpu_to_le16(3)) {
cERROR(1, ("Referrals of V%d version are not supported,"
- "should be V3", ref->VersionNumber));
+ "should be V3", le16_to_cpu(ref->VersionNumber)));
rc = -EINVAL;
goto parse_DFS_referrals_exit;
}
@@ -3977,7 +3975,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
if (rc)
goto parse_DFS_referrals_exit;
- ref += ref->Size;
+ ref += le16_to_cpu(ref->Size);
}
parse_DFS_referrals_exit:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 023434f72c1..e8fa46c7cff 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -653,6 +653,7 @@ multi_t2_fnd:
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
spin_unlock(&GlobalMid_Lock);
+ wake_up_all(&server->response_q);
/* don't exit until kthread_stop is called */
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -2120,6 +2121,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
}
+ if ((volume_info.cifs_acl) && (volume_info.dynperm))
+ cERROR(1, ("mount option dynperm ignored if cifsacl "
+ "mount option supported"));
+
tcon =
find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
volume_info.username);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f0b5b5f3dd2..fb69c1fa85c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -260,7 +260,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
buf, inode->i_sb, xid,
&fileHandle);
if (newinode) {
- newinode->i_mode = mode;
+ if (cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_DYNPERM)
+ newinode->i_mode = mode;
if ((oplock & CIFS_CREATE_ACTION) &&
(cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_SET_UID)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8636cec2642..0aac824371a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -546,7 +546,6 @@ int cifs_close(struct inode *inode, struct file *file)
msleep(timeout);
timeout *= 8;
}
- kfree(pSMBFile->search_resume_name);
kfree(file->private_data);
file->private_data = NULL;
} else
@@ -605,12 +604,6 @@ int cifs_closedir(struct inode *inode, struct file *file)
else
cifs_buf_release(ptmp);
}
- ptmp = pCFileStruct->search_resume_name;
- if (ptmp) {
- cFYI(1, ("closedir free resume name"));
- pCFileStruct->search_resume_name = NULL;
- kfree(ptmp);
- }
kfree(file->private_data);
file->private_data = NULL;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 129dbfe4dca..2e904bd111c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc) {
- if (rc == -EREMOTE && !is_dfs_referral) {
- is_dfs_referral = true;
- cFYI(DBG2, ("DFS ref"));
- /* for DFS, server does not give us real inode data */
- fill_fake_finddataunix(&find_data, sb);
- rc = 0;
- }
- }
+ if (rc == -EREMOTE && !is_dfs_referral) {
+ is_dfs_referral = true;
+ cFYI(DBG2, ("DFS ref"));
+ /* for DFS, server does not give us real inode data */
+ fill_fake_finddataunix(&find_data, sb);
+ rc = 0;
+ } else if (rc)
+ goto cgiiu_exit;
+
num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
end_of_file = le64_to_cpu(find_data.EndOfFile);
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
*pinode = new_inode(sb);
if (*pinode == NULL) {
rc = -ENOMEM;
- goto cgiiu_exit;
+ goto cgiiu_exit;
}
/* Is an i_ino of zero legal? */
/* note ino incremented to unique num in new_inode */
@@ -418,6 +418,7 @@ int cifs_get_inode_info(struct inode **pinode,
char *buf = NULL;
bool adjustTZ = false;
bool is_dfs_referral = false;
+ umode_t default_mode;
pTcon = cifs_sb->tcon;
cFYI(1, ("Getting info on %s", full_path));
@@ -530,47 +531,42 @@ int cifs_get_inode_info(struct inode **pinode,
inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
}
- /* set default mode. will override for dirs below */
- if (atomic_read(&cifsInfo->inUse) == 0)
- /* new inode, can safely set these fields */
- inode->i_mode = cifs_sb->mnt_file_mode;
- else /* since we set the inode type below we need to mask off
- to avoid strange results if type changes and both
- get orred in */
- inode->i_mode &= ~S_IFMT;
-/* if (attr & ATTR_REPARSE) */
- /* We no longer handle these as symlinks because we could not
- follow them due to the absolute path with drive letter */
- if (attr & ATTR_DIRECTORY) {
- /* override default perms since we do not do byte range locking
- on dirs */
- inode->i_mode = cifs_sb->mnt_dir_mode;
- inode->i_mode |= S_IFDIR;
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (cifsInfo->cifsAttrs & ATTR_SYSTEM) &&
- /* No need to le64 convert size of zero */
- (pfindData->EndOfFile == 0)) {
- inode->i_mode = cifs_sb->mnt_file_mode;
- inode->i_mode |= S_IFIFO;
-/* BB Finish for SFU style symlinks and devices */
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (cifsInfo->cifsAttrs & ATTR_SYSTEM)) {
- if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile),
- full_path, cifs_sb, xid))
- cFYI(1, ("Unrecognized sfu inode type"));
-
- cFYI(1, ("sfu mode 0%o", inode->i_mode));
+ /* get default inode mode */
+ if (attr & ATTR_DIRECTORY)
+ default_mode = cifs_sb->mnt_dir_mode;
+ else
+ default_mode = cifs_sb->mnt_file_mode;
+
+ /* set permission bits */
+ if (atomic_read(&cifsInfo->inUse) == 0 ||
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ inode->i_mode = default_mode;
+ else {
+ /* just reenable write bits if !ATTR_READONLY */
+ if ((inode->i_mode & S_IWUGO) == 0 &&
+ (attr & ATTR_READONLY) == 0)
+ inode->i_mode |= (S_IWUGO & default_mode);
+ inode->i_mode &= ~S_IFMT;
+ }
+ /* clear write bits if ATTR_READONLY is set */
+ if (attr & ATTR_READONLY)
+ inode->i_mode &= ~S_IWUGO;
+
+ /* set inode type */
+ if ((attr & ATTR_SYSTEM) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
+ /* no need to fix endianness on 0 */
+ if (pfindData->EndOfFile == 0)
+ inode->i_mode |= S_IFIFO;
+ else if (decode_sfu_inode(inode,
+ le64_to_cpu(pfindData->EndOfFile),
+ full_path, cifs_sb, xid))
+ cFYI(1, ("unknown SFU file type\n"));
} else {
- inode->i_mode |= S_IFREG;
- /* treat dos attribute of read-only as read-only mode eg 555 */
- if (cifsInfo->cifsAttrs & ATTR_READONLY)
- inode->i_mode &= ~(S_IWUGO);
- else if ((inode->i_mode & S_IWUGO) == 0)
- /* the ATTR_READONLY flag may have been */
- /* changed on server -- set any w bits */
- /* allowed by mnt_file_mode */
- inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode);
- /* BB add code to validate if device or weird share or device type? */
+ if (attr & ATTR_DIRECTORY)
+ inode->i_mode |= S_IFDIR;
+ else
+ inode->i_mode |= S_IFREG;
}
spin_lock(&inode->i_lock);
@@ -1019,8 +1015,11 @@ mkdir_get_info:
CIFS_MOUNT_MAP_SPECIAL_CHR);
}
if (direntry->d_inode) {
- direntry->d_inode->i_mode = mode;
- direntry->d_inode->i_mode |= S_IFDIR;
+ if (cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_DYNPERM)
+ direntry->d_inode->i_mode =
+ (mode | S_IFDIR);
+
if (cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_SET_UID) {
direntry->d_inode->i_uid =
@@ -1547,13 +1546,26 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
} else
goto cifs_setattr_exit;
}
- if (attrs->ia_valid & ATTR_UID) {
- cFYI(1, ("UID changed to %d", attrs->ia_uid));
- uid = attrs->ia_uid;
- }
- if (attrs->ia_valid & ATTR_GID) {
- cFYI(1, ("GID changed to %d", attrs->ia_gid));
- gid = attrs->ia_gid;
+
+ /*
+ * Without unix extensions we can't send ownership changes to the
+ * server, so silently ignore them. This is consistent with how
+ * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With
+ * CIFSACL support + proper Windows to Unix idmapping, we may be
+ * able to support this in the future.
+ */
+ if (!pTcon->unix_ext &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+ attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
+ } else {
+ if (attrs->ia_valid & ATTR_UID) {
+ cFYI(1, ("UID changed to %d", attrs->ia_uid));
+ uid = attrs->ia_uid;
+ }
+ if (attrs->ia_valid & ATTR_GID) {
+ cFYI(1, ("GID changed to %d", attrs->ia_gid));
+ gid = attrs->ia_gid;
+ }
}
time_buf.Attributes = 0;
@@ -1563,7 +1575,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid &= ~ATTR_MODE;
if (attrs->ia_valid & ATTR_MODE) {
- cFYI(1, ("Mode changed to 0x%x", attrs->ia_mode));
+ cFYI(1, ("Mode changed to 0%o", attrs->ia_mode));
mode = attrs->ia_mode;
}
@@ -1578,18 +1590,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_EXPERIMENTAL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
rc = mode_to_acl(inode, full_path, mode);
- else if ((mode & S_IWUGO) == 0) {
-#else
- if ((mode & S_IWUGO) == 0) {
+ else
#endif
- /* not writeable */
- if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
- set_dosattr = true;
- time_buf.Attributes =
- cpu_to_le32(cifsInode->cifsAttrs |
- ATTR_READONLY);
- }
- } else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+ if (((mode & S_IWUGO) == 0) &&
+ (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
+ set_dosattr = true;
+ time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs |
+ ATTR_READONLY);
+ /* fix up mode if we're not using dynperm */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ attrs->ia_mode = inode->i_mode & ~S_IWUGO;
+ } else if ((mode & S_IWUGO) &&
+ (cifsInode->cifsAttrs & ATTR_READONLY)) {
/* If file is readonly on server, we would
not be able to write to it - so if any write
bit is enabled for user or group or other we
@@ -1600,6 +1612,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
/* Windows ignores set to zero */
if (time_buf.Attributes == 0)
time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
+
+ /* reset local inode permissions to normal */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+ attrs->ia_mode &= ~(S_IALLUGO);
+ if (S_ISDIR(inode->i_mode))
+ attrs->ia_mode |=
+ cifs_sb->mnt_dir_mode;
+ else
+ attrs->ia_mode |=
+ cifs_sb->mnt_file_mode;
+ }
+ } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+ /* ignore mode change - ATTR_READONLY hasn't changed */
+ attrs->ia_valid &= ~ATTR_MODE;
}
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1d69b8014e0..4b17f8fe315 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -519,8 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
cFYI(1, ("dnotify on %s Action: 0x%x",
- pnotify->FileName,
- pnotify->Action)); /* BB removeme BB */
+ pnotify->FileName, pnotify->Action));
/* cifs_dump_mem("Rcvd notify Data: ",buf,
sizeof(struct smb_hdr)+60); */
return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 713c2511019..83f30695488 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -132,6 +132,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
__u32 attr;
__u64 allocation_size;
__u64 end_of_file;
+ umode_t default_mode;
/* save mtime and size */
local_mtime = tmp_inode->i_mtime;
@@ -187,48 +188,54 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
if (atomic_read(&cifsInfo->inUse) == 0) {
tmp_inode->i_uid = cifs_sb->mnt_uid;
tmp_inode->i_gid = cifs_sb->mnt_gid;
- /* set default mode. will override for dirs below */
- tmp_inode->i_mode = cifs_sb->mnt_file_mode;
- } else {
- /* mask off the type bits since it gets set
- below and we do not want to get two type
- bits set */
+ }
+
+ if (attr & ATTR_DIRECTORY)
+ default_mode = cifs_sb->mnt_dir_mode;
+ else
+ default_mode = cifs_sb->mnt_file_mode;
+
+ /* set initial permissions */
+ if ((atomic_read(&cifsInfo->inUse) == 0) ||
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ tmp_inode->i_mode = default_mode;
+ else {
+ /* just reenable write bits if !ATTR_READONLY */
+ if ((tmp_inode->i_mode & S_IWUGO) == 0 &&
+ (attr & ATTR_READONLY) == 0)
+ tmp_inode->i_mode |= (S_IWUGO & default_mode);
+
tmp_inode->i_mode &= ~S_IFMT;
}
- if (attr & ATTR_DIRECTORY) {
- *pobject_type = DT_DIR;
- /* override default perms since we do not lock dirs */
- if (atomic_read(&cifsInfo->inUse) == 0)
- tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
- tmp_inode->i_mode |= S_IFDIR;
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (attr & ATTR_SYSTEM)) {
+ /* clear write bits if ATTR_READONLY is set */
+ if (attr & ATTR_READONLY)
+ tmp_inode->i_mode &= ~S_IWUGO;
+
+ /* set inode type */
+ if ((attr & ATTR_SYSTEM) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
if (end_of_file == 0) {
- *pobject_type = DT_FIFO;
tmp_inode->i_mode |= S_IFIFO;
+ *pobject_type = DT_FIFO;
} else {
- /* rather than get the type here, we mark the
- inode as needing revalidate and get the real type
- (blk vs chr vs. symlink) later ie in lookup */
- *pobject_type = DT_REG;
+ /*
+ * trying to get the type can be slow, so just call
+ * this a regular file for now, and mark for reval
+ */
tmp_inode->i_mode |= S_IFREG;
+ *pobject_type = DT_REG;
cifsInfo->time = 0;
}
-/* we no longer mark these because we could not follow them */
-/* } else if (attr & ATTR_REPARSE) {
- *pobject_type = DT_LNK;
- tmp_inode->i_mode |= S_IFLNK; */
} else {
- *pobject_type = DT_REG;
- tmp_inode->i_mode |= S_IFREG;
- if (attr & ATTR_READONLY)
- tmp_inode->i_mode &= ~(S_IWUGO);
- else if ((tmp_inode->i_mode & S_IWUGO) == 0)
- /* the ATTR_READONLY flag may have been changed on */
- /* server -- set any w bits allowed by mnt_file_mode */
- tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode);
- } /* could add code here - to validate if device or weird share type? */
+ if (attr & ATTR_DIRECTORY) {
+ tmp_inode->i_mode |= S_IFDIR;
+ *pobject_type = DT_DIR;
+ } else {
+ tmp_inode->i_mode |= S_IFREG;
+ *pobject_type = DT_REG;
+ }
+ }
/* can not fill in nlink here as in qpathinfo version and Unx search */
if (atomic_read(&cifsInfo->inUse) == 0)
@@ -675,8 +682,6 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
cifsFile->invalidHandle = true;
CIFSFindClose(xid, pTcon, cifsFile->netfid);
}
- kfree(cifsFile->search_resume_name);
- cifsFile->search_resume_name = NULL;
if (cifsFile->srch_inf.ntwrk_buf_start) {
cFYI(1, ("freeing SMB ff cache buf on search rewind"));
if (cifsFile->srch_inf.smallBuf)
@@ -1043,9 +1048,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
} /* else {
cifsFile->invalidHandle = true;
CIFSFindClose(xid, pTcon, cifsFile->netfid);
- }
- kfree(cifsFile->search_resume_name);
- cifsFile->search_resume_name = NULL; */
+ } */
rc = find_cifs_entry(xid, pTcon, file,
&current_entry, &num_to_fill);
diff --git a/fs/dcache.c b/fs/dcache.c
index 3ee588d5f58..6068c25b393 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,6 +17,7 @@
#include <linux/syscalls.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
@@ -106,9 +107,10 @@ static void dentry_lru_remove(struct dentry *dentry)
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
- * Called with dcache_lock and per dentry lock held, drops both.
*/
static void dentry_iput(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct inode *inode = dentry->d_inode;
if (inode) {
@@ -132,12 +134,13 @@ static void dentry_iput(struct dentry * dentry)
* d_kill - kill dentry and return parent
* @dentry: dentry to kill
*
- * Called with dcache_lock and d_lock, releases both. The dentry must
- * already be unhashed and removed from the LRU.
+ * The dentry must already be unhashed and removed from the LRU.
*
* If this is the root of the dentry tree, return NULL.
*/
static struct dentry *d_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct dentry *parent;
@@ -383,11 +386,11 @@ restart:
* Try to prune ancestors as well. This is necessary to prevent
* quadratic behavior of shrink_dcache_parent(), but is also expected
* to be beneficial in reducing dentry cache fragmentation.
- *
- * Called with dcache_lock, drops it and then regains.
- * Called with dentry->d_lock held, drops it.
*/
static void prune_one_dentry(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
+ __acquires(dcache_lock)
{
__d_drop(dentry);
dentry = d_kill(dentry);
@@ -1604,10 +1607,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2)
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
- *
- * On return, dcache_lock will have been unlocked.
*/
static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+ __releases(dcache_lock)
{
struct mutex *m1 = NULL, *m2 = NULL;
struct dentry *ret;
@@ -1743,11 +1745,9 @@ out_nolock:
shouldnt_be_hashed:
spin_unlock(&dcache_lock);
BUG();
- goto shouldnt_be_hashed;
}
-static int prepend(char **buffer, int *buflen, const char *str,
- int namelen)
+static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
*buflen -= namelen;
if (*buflen < 0)
@@ -1757,8 +1757,13 @@ static int prepend(char **buffer, int *buflen, const char *str,
return 0;
}
+static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+{
+ return prepend(buffer, buflen, name->name, name->len);
+}
+
/**
- * d_path - return the path of a dentry
+ * __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: buffer to return value in
@@ -1779,9 +1784,10 @@ char *__d_path(const struct path *path, struct path *root,
{
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
- char * end = buffer+buflen;
- char * retval;
+ char *end = buffer + buflen;
+ char *retval;
+ spin_lock(&vfsmount_lock);
prepend(&end, &buflen, "\0", 1);
if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
(prepend(&end, &buflen, " (deleted)", 10) != 0))
@@ -1800,38 +1806,37 @@ char *__d_path(const struct path *path, struct path *root,
break;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
- spin_lock(&vfsmount_lock);
if (vfsmnt->mnt_parent == vfsmnt) {
- spin_unlock(&vfsmount_lock);
goto global_root;
}
dentry = vfsmnt->mnt_mountpoint;
vfsmnt = vfsmnt->mnt_parent;
- spin_unlock(&vfsmount_lock);
continue;
}
parent = dentry->d_parent;
prefetch(parent);
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
retval = end;
dentry = parent;
}
+out:
+ spin_unlock(&vfsmount_lock);
return retval;
global_root:
retval += 1; /* hit the slash */
- if (prepend(&retval, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0)
+ if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
goto Elong;
root->mnt = vfsmnt;
root->dentry = dentry;
- return retval;
+ goto out;
+
Elong:
- return ERR_PTR(-ENAMETOOLONG);
+ retval = ERR_PTR(-ENAMETOOLONG);
+ goto out;
}
/**
@@ -1845,9 +1850,9 @@ Elong:
*
* Returns the buffer or an error code if the path was too long.
*
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
*/
-char *d_path(struct path *path, char *buf, int buflen)
+char *d_path(const struct path *path, char *buf, int buflen)
{
char *res;
struct path root;
@@ -1915,16 +1920,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
retval = end-1;
*retval = '/';
- for (;;) {
- struct dentry *parent;
- if (IS_ROOT(dentry))
- break;
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dentry->d_parent;
- parent = dentry->d_parent;
prefetch(parent);
-
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
@@ -1975,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
error = -ENOENT;
/* Has the current directory has been unlinked? */
spin_lock(&dcache_lock);
- if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
+ if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
unsigned long len;
struct path tmp = root;
char * cwd;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33..f976f303c19 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
#include <linux/poll.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
struct dlm_user_proc *proc;
struct dlm_ls *ls;
+ lock_kernel();
ls = dlm_find_lockspace_device(iminor(inode));
- if (!ls)
+ if (!ls) {
+ unlock_kernel();
return -ENOENT;
+ }
proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
if (!proc) {
dlm_put_lockspace(ls);
+ unlock_kernel();
return -ENOMEM;
}
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
spin_lock_init(&proc->locks_spin);
init_waitqueue_head(&proc->wait);
file->private_data = proc;
+ unlock_kernel();
return 0;
}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
static int ctl_device_open(struct inode *inode, struct file *file)
{
+ cycle_kernel_lock();
file->private_data = NULL;
return 0;
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cd62d75b2cc..e2832bc7869 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1906,9 +1906,9 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
goto out;
}
}
- mutex_unlock(&key_tfm_list_mutex);
(*tfm) = key_tfm->key_tfm;
(*tfm_mutex) = &key_tfm->key_tfm_mutex;
out:
+ mutex_unlock(&key_tfm_list_mutex);
return rc;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 951ee33a022..c15c25745e0 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -660,8 +660,6 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
struct ecryptfs_auth_tok **auth_tok,
char *sig);
-int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start,
- int num_zeros);
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size);
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a..24749bf0668 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
#include "ecryptfs_kernel.h"
/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
int rc = 0;
struct file *lower_file = NULL;
+ lock_kernel();
lower_file = ecryptfs_file_to_lower(file);
if (lower_file->f_op && lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ unlock_kernel();
return rc;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 50c994a249a..09a4522f65e 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void)
int rc;
atomic_set(&ecryptfs_num_miscdev_opens, 0);
- mutex_lock(&ecryptfs_daemon_hash_mux);
rc = misc_register(&ecryptfs_miscdev);
if (rc)
printk(KERN_ERR "%s: Failed to register miscellaneous device "
"for communications with userspace daemons; rc = [%d]\n",
__func__, rc);
- mutex_unlock(&ecryptfs_daemon_hash_mux);
return rc;
}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index ebf55150be5..75c2ea9fee3 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -157,20 +157,6 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- if (start_offset_in_page) {
- /* Read in the page from the lower
- * into the eCryptfs inode page cache,
- * decrypting */
- rc = ecryptfs_decrypt_page(ecryptfs_page);
- if (rc) {
- printk(KERN_ERR "%s: Error decrypting "
- "page; rc = [%d]\n",
- __func__, rc);
- ClearPageUptodate(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- goto out;
- }
- }
ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
/*
@@ -349,14 +335,6 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
ecryptfs_page_idx, rc);
goto out;
}
- rc = ecryptfs_decrypt_page(ecryptfs_page);
- if (rc) {
- printk(KERN_ERR "%s: Error decrypting "
- "page; rc = [%d]\n", __func__, rc);
- ClearPageUptodate(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- goto out;
- }
ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
memcpy((data + data_offset),
((char *)ecryptfs_page_virt + start_offset_in_page),
diff --git a/fs/exec.c b/fs/exec.c
index 3c2ba7ce11d..fd9234379e8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,7 +26,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mman.h>
-#include <linux/a.out.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
@@ -61,6 +60,11 @@
#include <linux/kmod.h>
#endif
+#ifdef __alpha__
+/* for /sbin/loader handling in search_binary_handler() */
+#include <linux/a.out.h>
+#endif
+
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
int suid_dumpable = 0;
@@ -606,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->exec -= stack_shift;
down_write(&mm->mmap_sem);
- vm_flags = vma->vm_flags;
+ vm_flags = VM_STACK_FLAGS;
/*
* Adjust stack execute permissions; explicitly enable for
@@ -860,6 +864,7 @@ static int de_thread(struct task_struct *tsk)
no_thread_group:
exit_itimers(sig);
+ flush_itimer_signals();
if (leader)
release_task(leader);
@@ -1154,7 +1159,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
int try,retval;
struct linux_binfmt *fmt;
-#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT)
+#ifdef __alpha__
/* handle /sbin/loader.. */
{
struct exec * eh = (struct exec *) bprm->buf;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 28cfd0b4052..77278e947e9 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -580,7 +580,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
+ EXT3_ADDR_PER_BLOCK(sb));
end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fe3119a71ad..2845425077e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 30494c5da84..495ab21b983 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -43,6 +43,46 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
+static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
+ ext4_group_t block_group)
+{
+ ext4_group_t actual_group;
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
+ if (actual_group == block_group)
+ return 1;
+ return 0;
+}
+
+static int ext4_group_used_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ ext4_fsblk_t tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ /* block bitmap, inode bitmap, and inode table blocks */
+ int used_blocks = sbi->s_itb_per_group + 2;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+
+ gdp = ext4_get_group_desc(sb, block_group, &bh);
+ if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
+ block_group))
+ used_blocks--;
+
+ if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
+ block_group))
+ used_blocks--;
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!ext4_block_in_group(sb, tmp, block_group))
+ used_blocks -= 1;
+ }
+ }
+ return used_blocks;
+}
/* Initializes an uninitialized block bitmap if given, and returns the
* number of blocks free in the group. */
unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -81,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -105,20 +140,34 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
free_blocks = group_blocks - bit_max;
if (bh) {
- ext4_fsblk_t start;
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
for (bit = 0; bit < bit_max; bit++)
ext4_set_bit(bit, bh->b_data);
start = ext4_group_first_block_no(sb, block_group);
- /* Set bits for block and inode bitmaps, and inode table */
- ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data);
- ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data);
- for (bit = (ext4_inode_table(sb, gdp) - start),
- bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
+ /* Set bits for block and inode bitmaps, and inode table */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!flex_bg ||
+ ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+ }
/*
* Also if the number of blocks within the group is
* less than the blocksize * 8 ( which is the size
@@ -126,8 +175,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
}
-
- return free_blocks - sbi->s_itb_per_group - 2;
+ return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
}
@@ -242,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -252,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -356,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -641,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -757,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1545,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of neeed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks avaible for allocation for this request
+ * On success, return nblocks
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
/**
* ext4_should_retry_alloc()
@@ -1577,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1586,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1623,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1653,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1681,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1717,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1829,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1862,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1933,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea19..d3d23d73c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac..303e41cf7b1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fa..6c166c0a54b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d7..ef7409f0e7e 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b2..eb8bc3afe6e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f219..6300226d553 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3da..42c4c0c892e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worse case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366a..430eb7978db 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8..a45c3737ad3 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7ee..c2c0a8d06d0 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c80..a92eb305344 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphans has i_nlinks > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d970774641..8ca2763df09 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate @blocks for non extent file based file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ind_blks, dind_blks, tind_blks;
+
+ /* number of new indirect blocks needed */
+ ind_blks = (blocks + icap - 1) / icap;
+
+ dind_blks = (ind_blks + icap - 1) / icap;
+
+ tind_blks = 1;
+
+ return ind_blks + dind_blks + tind_blks;
+}
+
+/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_calc_metadata_amount(inode, blocks);
+
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;
+
+ /* update fs free blocks counter for truncate case */
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+ ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block;
+ struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int blocks, i;
+
+ blocks = bh->b_size >> inode->i_blkbits;
+ for (i = 0; i < blocks; i++)
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += bh->b_size;
+ return;
+ }
+
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ mpage_da_map_blocks(mpd);
+
+ /*
+ * Now start a new extent
+ */
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There is no attached buffer heads yet (mmap?)
+ * we treat the page asfull of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bio per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ get_block_t get_block)
+{
+ struct mpage_da_data mpd;
+ int ret;
+
+ if (!get_block)
+ return generic_writepages(mapping, wbc);
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+ mpd.lbh.b_size = 0;
+ mpd.lbh.b_state = 0;
+ mpd.lbh.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.get_block = get_block;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (mpd.next_page != mpd.first_page) {
+ mpage_da_map_blocks(&mpd);
+ mpage_da_submit_io(&mpd);
+ }
+
+ return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ }
+
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ if (EXT4_I(inode)->i_disksize == disksize) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * unmapped buffer is possible for holes.
+ * delay buffer is possible with delayed allocation
+ */
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ /*
+ * we don't want to do block allocation in writepage
+ * so call get_block_wrap with create = 0
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * get called vi ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+ * so just redity the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worse case needed credits to write out
+ * EXT4_MAX_BUF_BLOCKS pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *bh;
+ struct inode *inode = page->mapping->host;
+ unsigned int idx;
+ int i;
+
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
+
+ for (i=0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied -1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ ext4_da_page_release_reservation(page, offset);
+
+out:
+ ext4_invalidatepage(page, offset);
+
+ return;
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But to not confuse with user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation.We are not holding
+ * inode.i__mutex here. That allow * parallel write_begin,
+ * write_end call. lock_page prevent this from happening
+ * on the same page though
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 873ad9b3418..8d141a25bbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simple won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * calculate needed size. if change bb_counters size,
+ * don't forget about ext4_mb_generate_buddy()
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+ * datas) are set not up to date so that they will be re-initilaized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{ \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
{ \
- struct ext4_sb_info *sbi = data; \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2745,11 +2893,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sbi = EXT4_SB(sb);
es = sbi->s_es;
- ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
- gdp->bg_free_blocks_count);
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2762,6 +2908,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!gdp)
goto out_err;
+ ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+ gdp->bg_free_blocks_count);
+
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
@@ -2815,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * free blocks account has already be reduced/reserved
+ * at write_begin() time for delayed allocation
+ * do not double accounting
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -3094,8 +3259,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
- unsigned len = ac->ac_o_ex.fe_len;
-
+ unsigned int len = ac->ac_o_ex.fe_len;
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4085,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830..387ad98350c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9f086a6a472..f000fbe2cd9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -563,7 +563,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count %
+ EXT4_ADDR_PER_BLOCK(sb));
end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
@@ -854,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
*/
/* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
+ gdp = (struct ext4_group_desc *)((char *)primary->b_data +
+ gdb_off * EXT4_DESC_SIZE(sb));
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
@@ -864,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -955,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -986,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1058,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 09d9359c805..1cb371dcd60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -729,8 +739,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
}
- if (test_opt(sb, BARRIER))
- seq_puts(seq, ",barrier=1");
+ /*
+ * We're changing the default of barrier mount option, so
+ * let's always display its mount state so it's clear what its
+ * status is.
+ */
+ seq_puts(seq, ",barrier=");
+ seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ seq_puts(seq, ",journal_async_commit");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (!test_opt(sb, EXTENTS))
@@ -739,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -886,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -945,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -982,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1301,15 +1324,39 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+ /*
+ * When e2fsprogs support resizing an already existing
+ * ext3 file system to greater than 2**32 we need to
+ * add support to block allocator to handle growing
+ * already existing block mapped inode so that blocks
+ * allocated for them fall within 2**32
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1323,6 +1370,9 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1435,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1802,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1843,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1907,18 +2000,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
set_opt(sbi->s_mount_opt, RESERVATION);
+ set_opt(sbi->s_mount_opt, BARRIER);
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2129,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2189,6 +2304,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3;
+ if (!(sb->s_flags & MS_RDONLY) &&
+ EXT4_SB(sb)->s_journal->j_failed_commit) {
+ printk(KERN_CRIT "EXT4-fs error (device %s): "
+ "ext4_fill_super: Journal transaction "
+ "%u is corrupt\n", sb->s_id,
+ EXT4_SB(sb)->s_journal->j_failed_commit);
+ if (test_opt (sb, ERRORS_RO)) {
+ printk (KERN_CRIT
+ "Mounting filesystem read-only\n");
+ sb->s_flags |= MS_RDONLY;
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ }
+ if (test_opt(sb, ERRORS_PANIC)) {
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ ext4_commit_super(sb, es, 1);
+ printk(KERN_CRIT
+ "EXT4-fs (device %s): mount failed\n",
+ sb->s_id);
+ goto failed_mount4;
+ }
+ }
} else if (journal_inum) {
if (ext4_create_journal(sb, es, journal_inum))
goto failed_mount3;
@@ -2326,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2340,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3293,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
@@ -3305,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398..93c5fdcdad2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cad..ac1a52cf2a3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4..d91aa61b42a 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af2..3a9ecac8d61 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
- return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
}
static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99a..34541d06e62 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
loff_t cpos;
int ret = 0;
- lock_kernel();
+ lock_super(sb);
cpos = filp->f_pos;
/* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
if (unicode)
__putname(unicode);
out:
- unlock_kernel();
+ unlock_super(sb);
return ret;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 27cc1164ec3..c672df4036e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
- lock_kernel();
fat_free(inode, nr_clusters);
- unlock_kernel();
fat_flush_inodes(inode->i_sb, inode, NULL);
}
@@ -257,26 +254,34 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
}
EXPORT_SYMBOL_GPL(fat_getattr);
-static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode,
- mode_t mode)
+static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
+ struct inode *inode, umode_t *mode_ptr)
{
- mode_t mask, req = mode & ~S_IFMT;
+ mode_t mask, perm;
- if (S_ISREG(mode))
+ /*
+ * Note, the basic check is already done by a caller of
+ * (attr->ia_mode & ~MSDOS_VALID_MODE)
+ */
+
+ if (S_ISREG(inode->i_mode))
mask = sbi->options.fs_fmask;
else
mask = sbi->options.fs_dmask;
+ perm = *mode_ptr & ~(S_IFMT | mask);
+
/*
* Of the r and x bits, all (subject to umask) must be present. Of the
* w bits, either all (subject to umask) or none must be present.
*/
- req &= ~mask;
- if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
+ if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
return -EPERM;
- if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask)))
+ if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
return -EPERM;
+ *mode_ptr &= S_IFMT | perm;
+
return 0;
}
@@ -299,11 +304,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
struct inode *inode = dentry->d_inode;
- int mask, error = 0;
+ int error = 0;
unsigned int ia_valid;
- lock_kernel();
-
/*
* Expand the file. Since inode_setattr() updates ->i_size
* before calling the ->truncate(), but FAT needs to fill the
@@ -332,12 +335,13 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
error = 0;
goto out;
}
+
if (((attr->ia_valid & ATTR_UID) &&
(attr->ia_uid != sbi->options.fs_uid)) ||
((attr->ia_valid & ATTR_GID) &&
(attr->ia_gid != sbi->options.fs_gid)) ||
((attr->ia_valid & ATTR_MODE) &&
- fat_check_mode(sbi, inode, attr->ia_mode) < 0))
+ (attr->ia_mode & ~MSDOS_VALID_MODE)))
error = -EPERM;
if (error) {
@@ -346,17 +350,17 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
goto out;
}
- error = inode_setattr(inode, attr);
- if (error)
- goto out;
+ /*
+ * We don't return -EPERM here. Yes, strange, but this is too
+ * old behavior.
+ */
+ if (attr->ia_valid & ATTR_MODE) {
+ if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+ attr->ia_valid &= ~ATTR_MODE;
+ }
- if (S_ISDIR(inode->i_mode))
- mask = sbi->options.fs_dmask;
- else
- mask = sbi->options.fs_fmask;
- inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask);
+ error = inode_setattr(inode, attr);
out:
- unlock_kernel();
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d67..46a4508ffd2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
static void fat_clear_inode(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
- lock_kernel();
spin_lock(&sbi->inode_hash_lock);
fat_cache_inval_inode(inode);
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
- unlock_kernel();
}
static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
return 0;
- lock_kernel();
+ lock_super(sb);
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
if (i_pos != MSDOS_I(inode)->i_pos) {
spin_unlock(&sbi->inode_hash_lock);
brelse(bh);
- unlock_kernel();
+ unlock_super(sb);
goto retry;
}
@@ -606,7 +605,7 @@ retry:
err = sync_dirty_buffer(bh);
brelse(bh);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
static struct dentry *fat_get_parent(struct dentry *child)
{
+ struct super_block *sb = child->d_sb;
struct buffer_head *bh;
struct msdos_dir_entry *de;
loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
struct inode *inode;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
if (err) {
parent = ERR_PTR(err);
goto out;
}
- inode = fat_build_inode(child->d_sb, de, i_pos);
+ inode = fat_build_inode(sb, de, i_pos);
brelse(bh);
if (IS_ERR(inode)) {
parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
parent = ERR_PTR(-ENOMEM);
}
out:
- unlock_kernel();
+ unlock_super(sb);
return parent;
}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
long error;
char buf[50];
+ /*
+ * GFP_KERNEL is ok here, because while we do hold the
+ * supeblock lock, memory pressure can't call back into
+ * the filesystem, since we're only just about to mount
+ * it and have no inodes etc active!
+ */
sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a7..330a7d78259 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (error)
return error;
- lock_kernel();
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
out:
- unlock_kernel();
return error;
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fb77e096213..3141690558c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -488,7 +488,12 @@ static struct fuse_conn *new_conn(struct super_block *sb)
err = bdi_init(&fc->bdi);
if (err)
goto error_kfree;
- err = bdi_register_dev(&fc->bdi, fc->dev);
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
+ }
if (err)
goto error_bdi_destroy;
/*
@@ -586,7 +591,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
- fc->max_write = min_t(unsigned, 4096, fc->max_write);
+ fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
fuse_put_request(fc, req);
@@ -662,7 +667,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->flags = d.flags;
fc->user_id = d.user_id;
fc->group_id = d.group_id;
- fc->max_read = min_t(unsigned, 4096, d.max_read);
+ fc->max_read = max_t(unsigned, 4096, d.max_read);
/* Used by get_root_inode() */
sb->s_fs_info = fc;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c19184f2e70..bec76b1c2bb 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
}
-static inline unsigned int zero_metapath_length(const struct metapath *mp,
- unsigned height)
+static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
- unsigned int i;
- for (i = 0; i < height - 1; i++) {
- if (mp->mp_list[i] != 0)
- return i;
- }
- return height;
+ if (mp->mp_list[0] == 0)
+ return 2;
+ return 1;
}
/**
@@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn, dblock = 0;
- unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
+ unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = height - 1;
@@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
/* Building up tree height */
state = ALLOC_GROW_HEIGHT;
iblks = height - ip->i_height;
- zmpl = zero_metapath_length(mp, height);
- iblks -= zmpl;
- iblks += height;
+ branch_start = metapath_branch_start(mp);
+ iblks += (height - branch_start);
}
}
@@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
sizeof(struct gfs2_meta_header));
*ptr = zero_bn;
state = ALLOC_GROW_DEPTH;
- for(i = zmpl; i < height; i++) {
+ for(i = branch_start; i < height; i++) {
if (mp->mp_bh[i] == NULL)
break;
brelse(mp->mp_bh[i]);
mp->mp_bh[i] = NULL;
}
- i = zmpl;
+ i = branch_start;
}
if (n == 0)
break;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a06..24dd5945008 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
} else
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
return error;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6387523a315..3401628d742 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -195,7 +195,7 @@ ulong_aligned:
depending on architecture. I've experimented with several ways
of writing this section such as using an else before the goto
but this one seems to be the fastest. */
- while ((unsigned char *)plong < end - 1) {
+ while ((unsigned char *)plong < end - sizeof(unsigned long)) {
prefetch(plong + 1);
if (((*plong) & LBITMASK) != lskipval)
break;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022c..91389c8aee8 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4d99685fdce..f8b3be87322 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal,
spin_unlock(&journal->j_state_lock);
/* And try again, without the barrier */
+ lock_buffer(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
ret = submit_bh(WRITE, bh);
@@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -546,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -573,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -747,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing
+ * commit block, which happens below in such setting.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -767,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -826,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -853,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -873,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -996,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a7..b26c6d9fe6a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 5d0405a9e7c..058f50f65b7 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -344,6 +344,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
obh->b_size);
}
+ put_bh(obh);
}
return 0;
}
@@ -610,9 +611,8 @@ static int do_one_pass(journal_t *journal,
chksum_err = chksum_seen = 0;
if (info->end_transaction) {
- printk(KERN_ERR "JBD: Transaction %u "
- "found to be corrupt.\n",
- next_commit_ID - 1);
+ journal->j_failed_commit =
+ info->end_transaction;
brelse(bh);
break;
}
@@ -643,10 +643,8 @@ static int do_one_pass(journal_t *journal,
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
- printk(KERN_ERR
- "JBD: Transaction %u "
- "found to be corrupt.\n",
- next_commit_ID);
+ journal->j_failed_commit =
+ next_commit_ID;
brelse(bh);
break;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e6780..4f7cadbb19f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The later might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/libfs.c b/fs/libfs.c
index b004dfadd89..baeb71ee1cd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count)
mntput(mnt);
}
+/**
+ * simple_read_from_buffer - copy data from the buffer to user space
+ * @to: the user space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The simple_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the user space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
@@ -528,6 +542,37 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
return count;
}
+/**
+ * memory_read_from_buffer - copy data from the buffer
+ * @to: the kernel space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The memory_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the kernel space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
+ const void *from, size_t available)
+{
+ loff_t pos = *ppos;
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= available)
+ return 0;
+ if (count > available - pos)
+ count = available - pos;
+ memcpy(to, from + pos, count);
+ *ppos = pos + count;
+
+ return count;
+}
+
/*
* Transaction based IO.
* The file expects a single write which triggers the transaction, and then
@@ -800,6 +845,7 @@ EXPORT_SYMBOL(simple_statfs);
EXPORT_SYMBOL(simple_sync_file);
EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(memory_read_from_buffer);
EXPORT_SYMBOL(simple_transaction_get);
EXPORT_SYMBOL(simple_transaction_read);
EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/locks.c b/fs/locks.c
index 11dbf08651b..dce8c747371 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
/* insert into file's list */
fl->fl_next = *pos;
*pos = fl;
-
- if (fl->fl_ops && fl->fl_ops->fl_insert)
- fl->fl_ops->fl_insert(fl);
}
/*
@@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
fl->fl_fasync = NULL;
}
- if (fl->fl_ops && fl->fl_ops->fl_remove)
- fl->fl_ops->fl_remove(fl);
-
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a..dbcc7af76a1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d702..1f7f2956412 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &msdos_dentry_operations;
- lock_kernel();
+ lock_super(sb);
res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (res == -ENOENT)
goto add;
@@ -232,7 +232,7 @@ add:
if (dentry)
dentry->d_op = &msdos_dentry_operations;
out:
- unlock_kernel();
+ unlock_super(sb);
if (!res)
return dentry;
return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
err = fat_flush_inodes(sb, dir, inode);
return err;
@@ -324,11 +324,12 @@ out:
/***** Remove a directory */
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
+ struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
/*
* Check whether the directory is not in use, then check
* whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, is_hid, cluster;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
fat_flush_inodes(sb, dir, inode);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -419,10 +420,11 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb= inode->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (err)
goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -618,10 +620,11 @@ error_inode:
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(old_dentry->d_name.name,
old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
new_dir, new_msdos_name, new_dentry, is_hid);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+ err = fat_flush_inodes(sb, old_dir, new_dir);
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index c7e43536c49..01e67dddcc3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -581,15 +581,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
int result;
/* make sure the stuff we saved doesn't go away */
- dget(save.dentry);
- mntget(save.mnt);
+ path_get(&save);
result = __link_path_walk(name, nd);
if (result == -ESTALE) {
/* nd->path had been dropped */
nd->path = save;
- dget(nd->path.dentry);
- mntget(nd->path.mnt);
+ path_get(&nd->path);
nd->flags |= LOOKUP_REVAL;
result = __link_path_walk(name, nd);
}
@@ -1216,8 +1214,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
nd->flags = flags;
nd->depth = 0;
- nd->path.mnt = mntget(mnt);
- nd->path.dentry = dget(dentry);
+ nd->path.dentry = dentry;
+ nd->path.mnt = mnt;
+ path_get(&nd->path);
retval = path_walk(name, nd);
if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
@@ -2857,16 +2856,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct nameidata nd;
void *cookie;
+ int res;
nd.depth = 0;
cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
- if (!IS_ERR(cookie)) {
- int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
- cookie = ERR_PTR(res);
- }
- return PTR_ERR(cookie);
+ if (IS_ERR(cookie))
+ return PTR_ERR(cookie);
+
+ res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ if (dentry->d_inode->i_op->put_link)
+ dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+ return res;
}
int vfs_follow_link(struct nameidata *nd, const char *link)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e..4f6f7635b59 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
const char *str;
};
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+
+ return security_sb_show_options(m, sb);
}
static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- show_sb_opts(m, mnt->mnt_sb);
+ err = show_sb_opts(m, mnt->mnt_sb);
+ if (err)
+ goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
+out:
return err;
}
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
seq_putc(m, ' ');
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- show_sb_opts(m, sb);
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt);
seq_putc(m, '\n');
+out:
return err;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b3..6a7d901f193 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
return 0;
}
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations ncp_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = ncp_remote_llseek,
.read = ncp_file_read,
.write = ncp_file_write,
.ioctl = ncp_ioctl,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58d43daec08..982a2064fe4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -204,7 +204,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
/* Should never happen */
nfs_zap_mapping(inode, inode->i_mapping);
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32a..4e98a56a177 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,6 +170,7 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
+ loff_t loff;
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
@@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(filp, offset, origin);
+ lock_kernel(); /* BKL needed? */
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ unlock_kernel();
+ return loff;
}
/*
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 49c7cd0502c..779d2eb649c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
struct mnt_fhstatus *res)
{
struct nfs_fh *fh = res->fh;
+ unsigned size;
if ((res->status = ntohl(*p++)) == 0) {
- int size = ntohl(*p++);
- if (size <= NFS3_FHSIZE) {
+ size = ntohl(*p++);
+ if (size <= NFS3_FHSIZE && size != 0) {
fh->size = size;
memcpy(fh->data, p, size);
} else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2a4a024a4e7..614efeed543 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options,
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
- memset(args, 0, sizeof(*args));
-
if (data == NULL)
goto out_no_data;
@@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options,
case 5:
memset(data->context, 0, sizeof(data->context));
case 6:
- if (data->flags & NFS_MOUNT_VER3)
+ if (data->flags & NFS_MOUNT_VER3) {
+ if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
+ goto out_invalid_fh;
mntfh->size = data->root.size;
- else
+ } else
mntfh->size = NFS2_FHSIZE;
- if (mntfh->size > sizeof(mntfh->data))
- goto out_invalid_fh;
memcpy(mntfh->data, data->root.data, mntfh->size);
if (mntfh->size < sizeof(mntfh->data))
@@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type,
{
struct nfs_server *server = NULL;
struct super_block *s;
- struct nfs_fh mntfh;
- struct nfs_parsed_mount_data data;
+ struct nfs_parsed_mount_data *data;
+ struct nfs_fh *mntfh;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error;
+ int error = -ENOMEM;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ if (data == NULL || mntfh == NULL)
+ goto out_free_fh;
- security_init_mnt_opts(&data.lsm_opts);
+ security_init_mnt_opts(&data->lsm_opts);
/* Validate the mount data */
- error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
+ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs_create_server(&data, &mntfh);
+ server = nfs_create_server(data, mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
- nfs_fill_super(s, &data);
+ nfs_fill_super(s, data);
}
- mntroot = nfs_get_root(s, &mntfh);
+ mntroot = nfs_get_root(s, mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
- error = security_sb_set_mnt_opts(s, &data.lsm_opts);
+ error = security_sb_set_mnt_opts(s, &data->lsm_opts);
if (error)
goto error_splat_root;
@@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
error = 0;
out:
- kfree(data.nfs_server.hostname);
- kfree(data.mount_server.hostname);
- security_free_mnt_opts(&data.lsm_opts);
+ kfree(data->nfs_server.hostname);
+ kfree(data->mount_server.hostname);
+ security_free_mnt_opts(&data->lsm_opts);
+out_free_fh:
+ kfree(mntfh);
+ kfree(data);
return error;
out_err_nosb:
@@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options,
struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
char *c;
- memset(args, 0, sizeof(*args));
-
if (data == NULL)
goto out_no_data;
@@ -1959,26 +1963,31 @@ out_no_client_address:
static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
- struct nfs_parsed_mount_data data;
+ struct nfs_parsed_mount_data *data;
struct super_block *s;
struct nfs_server *server;
- struct nfs_fh mntfh;
+ struct nfs_fh *mntfh;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error;
+ int error = -ENOMEM;
- security_init_mnt_opts(&data.lsm_opts);
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ if (data == NULL || mntfh == NULL)
+ goto out_free_fh;
+
+ security_init_mnt_opts(&data->lsm_opts);
/* Validate the mount data */
- error = nfs4_validate_mount_data(raw_data, &data, dev_name);
+ error = nfs4_validate_mount_data(raw_data, data, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs4_create_server(&data, &mntfh);
+ server = nfs4_create_server(data, mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
nfs4_fill_super(s);
}
- mntroot = nfs4_get_root(s, &mntfh);
+ mntroot = nfs4_get_root(s, mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
- error = security_sb_set_mnt_opts(s, &data.lsm_opts);
+ error = security_sb_set_mnt_opts(s, &data->lsm_opts);
if (error)
goto error_splat_root;
@@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
error = 0;
out:
- kfree(data.client_address);
- kfree(data.nfs_server.export_path);
- kfree(data.nfs_server.hostname);
- security_free_mnt_opts(&data.lsm_opts);
+ kfree(data->client_address);
+ kfree(data->nfs_server.export_path);
+ kfree(data->nfs_server.hostname);
+ security_free_mnt_opts(&data->lsm_opts);
+out_free_fh:
+ kfree(mntfh);
+ kfree(data);
return error;
out_free:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6d8ace3e325..f333848fd3b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page,
}
status = nfs_writepage_setup(ctx, page, offset, count);
- __set_page_dirty_nobuffers(page);
+ if (status < 0)
+ nfs_set_pageerror(page);
+ else
+ __set_page_dirty_nobuffers(page);
dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
status, (long long)i_size_read(inode));
- if (status < 0)
- nfs_set_pageerror(page);
return status;
}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 9101807dc81..e2f72ca9803 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -77,11 +77,10 @@ ntfschar *generate_default_upcase(void)
uc[i] = cpu_to_le16(i);
for (r = 0; uc_run_table[r][0]; r++)
for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
- uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) +
- uc_run_table[r][2]);
+ le16_add_cpu(&uc[i], uc_run_table[r][2]);
for (r = 0; uc_dup_table[r][0]; r++)
for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
- uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
+ le16_add_cpu(&uc[i + 1], -1);
for (r = 0; uc_word_table[r][0]; r++)
uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
return uc;
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cf9401e8cd0..cfdb08b484e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -21,7 +21,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sysctl.h>
#include <linux/configfs.h>
#include "tcp.h"
@@ -36,65 +35,6 @@
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
-#define OCFS2_MAX_HB_CTL_PATH 256
-static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
-
-static ctl_table ocfs2_nm_table[] = {
- {
- .ctl_name = 1,
- .procname = "hb_ctl_path",
- .data = ocfs2_hb_ctl_path,
- .maxlen = OCFS2_MAX_HB_CTL_PATH,
- .mode = 0644,
- .proc_handler = &proc_dostring,
- .strategy = &sysctl_string,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ocfs2_mod_table[] = {
- {
- .ctl_name = FS_OCFS2_NM,
- .procname = "nm",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_nm_table
- },
- { .ctl_name = 0}
-};
-
-static ctl_table ocfs2_kern_table[] = {
- {
- .ctl_name = FS_OCFS2,
- .procname = "ocfs2",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_mod_table
- },
- { .ctl_name = 0}
-};
-
-static ctl_table ocfs2_root_table[] = {
- {
- .ctl_name = CTL_FS,
- .procname = "fs",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_kern_table
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header *ocfs2_table_header = NULL;
-
-const char *o2nm_get_hb_ctl_path(void)
-{
- return ocfs2_hb_ctl_path;
-}
-EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
@@ -941,9 +881,6 @@ void o2nm_undepend_this_node(void)
static void __exit exit_o2nm(void)
{
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
-
/* XXX sync with hb callbacks and shut down hb? */
o2net_unregister_hb_callbacks();
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
@@ -964,16 +901,9 @@ static int __init init_o2nm(void)
if (ret)
goto out;
- ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
- if (!ocfs2_table_header) {
- printk(KERN_ERR "nodemanager: unable to register sysctl\n");
- ret = -ENOMEM; /* or something. */
- goto out_o2net;
- }
-
ret = o2net_register_hb_callbacks();
if (ret)
- goto out_sysctl;
+ goto out_o2net;
config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
@@ -990,8 +920,6 @@ static int __init init_o2nm(void)
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
o2net_unregister_hb_callbacks();
-out_sysctl:
- unregister_sysctl_table(ocfs2_table_header);
out_o2net:
o2net_exit();
out:
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 7c860361b8d..c992ea0da4a 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,10 +33,6 @@
#include <linux/configfs.h>
#include <linux/rbtree.h>
-#define FS_OCFS2_NM 1
-
-const char *o2nm_get_hb_ctl_path(void);
-
struct o2nm_node {
spinlock_t nd_lock;
struct config_item nd_item;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1e44ad14881..a27d61581bd 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,53 +142,43 @@ static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
-static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
-{
#ifdef CONFIG_DEBUG_FS
+void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
INIT_LIST_HEAD(&nst->st_net_debug_item);
nst->st_task = task;
nst->st_msg_type = msgtype;
nst->st_msg_key = msgkey;
nst->st_node = node;
-#endif
}
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_sock_time);
-#endif
}
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_send_time);
-#endif
}
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_status_time);
-#endif
}
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
struct o2net_sock_container *sc)
{
-#ifdef CONFIG_DEBUG_FS
nst->st_sc = sc;
-#endif
}
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
{
-#ifdef CONFIG_DEBUG_FS
nst->st_id = msg_id;
-#endif
}
+#endif /* CONFIG_DEBUG_FS */
static inline int o2net_reconnect_delay(void)
{
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index a705d5d1903..fd6179eb26d 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -128,23 +128,23 @@ void o2net_debug_del_nst(struct o2net_send_tracking *nst);
void o2net_debug_add_sc(struct o2net_sock_container *sc);
void o2net_debug_del_sc(struct o2net_sock_container *sc);
#else
-static int o2net_debugfs_init(void)
+static inline int o2net_debugfs_init(void)
{
return 0;
}
-static void o2net_debugfs_exit(void)
+static inline void o2net_debugfs_exit(void)
{
}
-static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
}
-static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
}
-static void o2net_debug_add_sc(struct o2net_sock_container *sc)
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
}
-static void o2net_debug_del_sc(struct o2net_sock_container *sc)
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b..18307ff81b7 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,10 +224,42 @@ struct o2net_send_tracking {
struct timeval st_send_time;
struct timeval st_status_time;
};
+
+void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node);
+void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc);
+void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
+
#else
struct o2net_send_tracking {
u32 dummy;
};
+
+static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
+}
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
+{
+}
#endif /* CONFIG_DEBUG_FS */
#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index d34a62a3a62..8c686d22f9c 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -60,25 +60,25 @@ void dlm_destroy_debugfs_root(void);
#else
-static int dlm_debug_init(struct dlm_ctxt *dlm)
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
return 0;
}
-static void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
-static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
return 0;
}
-static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
-static int dlm_create_debugfs_root(void)
+static inline int dlm_create_debugfs_root(void)
{
return 0;
}
-static void dlm_destroy_debugfs_root(void)
+static inline void dlm_destroy_debugfs_root(void)
{
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128..44f87caf368 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ spin_lock(&dlm->spinlock);
list_add_tail(&res->tracking, &dlm->tracking_list);
+ spin_unlock(&dlm->spinlock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a..80e20d9f278 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1554,8 +1554,8 @@ out:
*/
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
- int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
- unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+ int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* Get the lock at NLMODE to start - that way we
* can cancel the upconvert request if need be.
*/
- ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
}
lockres->l_action = OCFS2_AST_CONVERT;
- lkm_flags |= LKM_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file)
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
return;
- if (lockres->l_level == LKM_NLMODE)
+ if (lockres->l_level == DLM_LOCK_NL)
return;
mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file)
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
lockres->l_blocking = DLM_LOCK_EX;
- gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
+ ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
if (ret) {
mlog_errno(ret);
return;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bbd1667aa7d..fcd120f1493 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -317,8 +317,7 @@ out:
return rc;
}
-static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
- int hangup_pending)
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
struct dlm_ctxt *dlm = conn->cc_lockspace;
struct o2dlm_private *priv = conn->cc_private;
@@ -333,43 +332,6 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
return 0;
}
-static void o2hb_stop(const char *group)
-{
- int ret;
- char *argv[5], *envp[3];
-
- argv[0] = (char *)o2nm_get_hb_ctl_path();
- argv[1] = "-K";
- argv[2] = "-u";
- argv[3] = (char *)group;
- argv[4] = NULL;
-
- mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
- /* minimal command environment taken from cpu_run_sbin_hotplug */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- if (ret < 0)
- mlog_errno(ret);
-}
-
-/*
- * Hangup is a hack for tools compatibility. Older ocfs2-tools software
- * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This
- * happens regardless of whether the DLM got started, so we can't do it
- * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into
- * the glue and provide a "hangup" API for super.c to call.
- *
- * Other stacks will eventually provide a NULL ->hangup() pointer.
- */
-static void o2cb_cluster_hangup(const char *group, int grouplen)
-{
- o2hb_stop(group);
-}
-
static int o2cb_cluster_this_node(unsigned int *node)
{
int node_num;
@@ -388,7 +350,6 @@ static int o2cb_cluster_this_node(unsigned int *node)
static struct ocfs2_stack_operations o2cb_stack_ops = {
.connect = o2cb_cluster_connect,
.disconnect = o2cb_cluster_disconnect,
- .hangup = o2cb_cluster_hangup,
.this_node = o2cb_cluster_this_node,
.dlm_lock = o2cb_dlm_lock,
.dlm_unlock = o2cb_dlm_unlock,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index b503772cd0e..bd7e0f3acfc 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
+#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
@@ -61,7 +62,7 @@
* negotiated by the client. The client negotiates based on the maximum
* version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
* number from the "SETV" message must match
- * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number
* must be less than or equal to ...->lp_max_version.pv_minor.
*
* Once this information has been set, mounts will be allowed. From this
@@ -153,7 +154,7 @@ union ocfs2_control_message {
struct ocfs2_control_message_down u_down;
};
-static struct ocfs2_stack_plugin user_stack;
+static struct ocfs2_stack_plugin ocfs2_user_plugin;
static atomic_t ocfs2_control_opened;
static int ocfs2_control_this_node = -1;
@@ -399,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
- &user_stack.sp_proto->lp_max_version;
+ &ocfs2_user_plugin.sp_proto->lp_max_version;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
+ lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
+ unlock_kernel();
return 0;
}
@@ -680,7 +683,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
int status = lksb->sb_status;
- BUG_ON(user_stack.sp_proto == NULL);
+ BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
/*
* For now we're punting on the issue of other non-standard errors
@@ -693,16 +696,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
*/
if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
- user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+ ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0);
else
- user_stack.sp_proto->lp_lock_ast(astarg);
+ ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg);
}
static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
{
- BUG_ON(user_stack.sp_proto == NULL);
+ BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
- user_stack.sp_proto->lp_blocking_ast(astarg, level);
+ ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level);
}
static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -816,8 +819,7 @@ out:
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
- int hangup_pending)
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
dlm_release_lockspace(conn->cc_lockspace, 2);
conn->cc_lockspace = NULL;
@@ -838,7 +840,7 @@ static int user_cluster_this_node(unsigned int *this_node)
return 0;
}
-static struct ocfs2_stack_operations user_stack_ops = {
+static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.connect = user_cluster_connect,
.disconnect = user_cluster_disconnect,
.this_node = user_cluster_this_node,
@@ -849,20 +851,20 @@ static struct ocfs2_stack_operations user_stack_ops = {
.dump_lksb = user_dlm_dump_lksb,
};
-static struct ocfs2_stack_plugin user_stack = {
+static struct ocfs2_stack_plugin ocfs2_user_plugin = {
.sp_name = "user",
- .sp_ops = &user_stack_ops,
+ .sp_ops = &ocfs2_user_plugin_ops,
.sp_owner = THIS_MODULE,
};
-static int __init user_stack_init(void)
+static int __init ocfs2_user_plugin_init(void)
{
int rc;
rc = ocfs2_control_init();
if (!rc) {
- rc = ocfs2_stack_glue_register(&user_stack);
+ rc = ocfs2_stack_glue_register(&ocfs2_user_plugin);
if (rc)
ocfs2_control_exit();
}
@@ -870,14 +872,14 @@ static int __init user_stack_init(void)
return rc;
}
-static void __exit user_stack_exit(void)
+static void __exit ocfs2_user_plugin_exit(void)
{
- ocfs2_stack_glue_unregister(&user_stack);
+ ocfs2_stack_glue_unregister(&ocfs2_user_plugin);
ocfs2_control_exit();
}
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
MODULE_LICENSE("GPL");
-module_init(user_stack_init);
-module_exit(user_stack_exit);
+module_init(ocfs2_user_plugin_init);
+module_exit(ocfs2_user_plugin_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 119f60cea9c..10e149ae5e3 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/sysctl.h>
#include "ocfs2_fs.h"
@@ -33,11 +34,13 @@
#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
#define OCFS2_STACK_PLUGIN_USER "user"
+#define OCFS2_MAX_HB_CTL_PATH 256
static struct ocfs2_locking_protocol *lproto;
static DEFINE_SPINLOCK(ocfs2_stack_lock);
static LIST_HEAD(ocfs2_stack_list);
static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
/*
* The stack currently in use. If not null, active_stack->sp_count > 0,
@@ -349,7 +352,7 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
BUG_ON(conn == NULL);
- ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
+ ret = active_stack->sp_ops->disconnect(conn);
/* XXX Should we free it anyway? */
if (!ret) {
@@ -362,13 +365,48 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+/*
+ * Leave the group for this filesystem. This is executed by a userspace
+ * program (stored in ocfs2_hb_ctl_path).
+ */
+static void ocfs2_leave_group(const char *group)
+{
+ int ret;
+ char *argv[5], *envp[3];
+
+ argv[0] = ocfs2_hb_ctl_path;
+ argv[1] = "-K";
+ argv[2] = "-u";
+ argv[3] = (char *)group;
+ argv[4] = NULL;
+
+ /* minimal command environment taken from cpu_run_sbin_hotplug */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (ret < 0) {
+ printk(KERN_ERR
+ "ocfs2: Error %d running user helper "
+ "\"%s %s %s %s\"\n",
+ ret, argv[0], argv[1], argv[2], argv[3]);
+ }
+}
+
+/*
+ * Hangup is a required post-umount. ocfs2-tools software expects the
+ * filesystem to call "ocfs2_hb_ctl" during unmount. This happens
+ * regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does
+ * the actual work.
+ */
void ocfs2_cluster_hangup(const char *group, int grouplen)
{
BUG_ON(group == NULL);
BUG_ON(group[grouplen] != '\0');
- if (active_stack->sp_ops->hangup)
- active_stack->sp_ops->hangup(group, grouplen);
+ ocfs2_leave_group(group);
/* cluster_disconnect() was called with hangup_pending==1 */
ocfs2_stack_driver_put();
@@ -548,10 +586,83 @@ error:
return ret;
}
+/*
+ * Sysctl bits
+ *
+ * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't
+ * make as much sense in a multiple cluster stack world, but it's safer
+ * and easier to preserve the name.
+ */
+
+#define FS_OCFS2_NM 1
+
+static ctl_table ocfs2_nm_table[] = {
+ {
+ .ctl_name = 1,
+ .procname = "hb_ctl_path",
+ .data = ocfs2_hb_ctl_path,
+ .maxlen = OCFS2_MAX_HB_CTL_PATH,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ocfs2_mod_table[] = {
+ {
+ .ctl_name = FS_OCFS2_NM,
+ .procname = "nm",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_nm_table
+ },
+ { .ctl_name = 0}
+};
+
+static ctl_table ocfs2_kern_table[] = {
+ {
+ .ctl_name = FS_OCFS2,
+ .procname = "ocfs2",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_mod_table
+ },
+ { .ctl_name = 0}
+};
+
+static ctl_table ocfs2_root_table[] = {
+ {
+ .ctl_name = CTL_FS,
+ .procname = "fs",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_kern_table
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ocfs2_table_header = NULL;
+
+
+/*
+ * Initialization
+ */
+
static int __init ocfs2_stack_glue_init(void)
{
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+ ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+ if (!ocfs2_table_header) {
+ printk(KERN_ERR
+ "ocfs2 stack glue: unable to register sysctl\n");
+ return -ENOMEM; /* or something. */
+ }
+
return ocfs2_sysfs_init();
}
@@ -559,6 +670,8 @@ static void __exit ocfs2_stack_glue_exit(void)
{
lproto = NULL;
ocfs2_sysfs_exit();
+ if (ocfs2_table_header)
+ unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 005e4f170e0..db56281dd1b 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -134,22 +134,10 @@ struct ocfs2_stack_operations {
* be freed. Thus, a stack must not return from ->disconnect()
* until it will no longer reference the conn pointer.
*
- * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
- * be dropping the reference on the module.
+ * Once this call returns, the stack glue will be dropping this
+ * connection's reference on the module.
*/
- int (*disconnect)(struct ocfs2_cluster_connection *conn,
- int hangup_pending);
-
- /*
- * ocfs2_cluster_hangup() exists for compatibility with older
- * ocfs2 tools. Only the classic stack really needs it. As such
- * ->hangup() is not required of all stacks. See the comment by
- * ocfs2_cluster_hangup() for more details.
- *
- * Note that ocfs2_cluster_hangup() can only be called if
- * hangup_pending was passed to ocfs2_cluster_disconnect().
- */
- void (*hangup)(const char *group, int grouplen);
+ int (*disconnect)(struct ocfs2_cluster_connection *conn);
/*
* ->this_node() returns the cluster's unique identifier for the
@@ -258,4 +246,5 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
/* Used by stack plugins */
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+
#endif /* STACKGLUE_H */
diff --git a/fs/open.c b/fs/open.c
index a1450086e92..a99ad09c319 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
+#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/vfs.h>
@@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
{
struct nameidata nd;
int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
+ kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
int res;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
old_fsuid = current->fsuid;
old_fsgid = current->fsgid;
- old_cap = current->cap_effective;
current->fsuid = current->uid;
current->fsgid = current->gid;
- /*
- * Clear the capabilities if we switch to a non-root user
- *
- * FIXME: There is a race here against sys_capset. The
- * capabilities can change yet we will restore the old
- * value below. We should hold task_capabilities_lock,
- * but we cannot because user_path_walk can sleep.
- */
- if (current->uid)
- cap_clear(current->cap_effective);
- else
- current->cap_effective = current->cap_permitted;
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ /*
+ * Clear the capabilities if we switch to a non-root user
+ */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+ /*
+ * FIXME: There is a race here against sys_capset. The
+ * capabilities can change yet we will restore the old
+ * value below. We should hold task_capabilities_lock,
+ * but we cannot because user_path_walk can sleep.
+ */
+#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+ if (current->uid)
+ old_cap = cap_set_effective(__cap_empty_set);
+ else
+ old_cap = cap_set_effective(current->cap_permitted);
+ }
res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
if (res)
@@ -478,7 +483,9 @@ out_path_release:
out:
current->fsuid = old_fsuid;
current->fsgid = old_fsgid;
- current->cap_effective = old_cap;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP))
+ cap_set_effective(old_cap);
return res;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index ec228bc9f88..700f4e0d957 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1003,8 +1003,7 @@ struct file *create_write_pipe(void)
void free_write_pipe(struct file *f)
{
free_pipe_info(f->f_dentry->d_inode);
- dput(f->f_path.dentry);
- mntput(f->f_path.mnt);
+ path_put(&f->f_path);
put_filp(f);
}
@@ -1015,8 +1014,8 @@ struct file *create_read_pipe(struct file *wrf)
return ERR_PTR(-ENFILE);
/* Grab pipe from the writer */
- f->f_path.mnt = mntget(wrf->f_path.mnt);
- f->f_path.dentry = dget(wrf->f_path.dentry);
+ f->f_path = wrf->f_path;
+ path_get(&wrf->f_path);
f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
f->f_pos = 0;
@@ -1068,8 +1067,7 @@ int do_pipe(int *fd)
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
- dput(fr->f_dentry);
- mntput(fr->f_vfsmnt);
+ path_put(&fr->f_path);
put_filp(fr);
err_write_pipe:
free_write_pipe(fw);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9e3b8c33c24..797d775e035 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header,
seq_printf(m, "%s", header);
CAP_FOR_EACH_U32(__capi) {
seq_printf(m, "%08x",
- a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]);
+ a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
}
seq_printf(m, "\n");
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c447e0743a3..58c3e6a8e15 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -127,6 +127,25 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = &proc_##OTYPE } )
+/*
+ * Count the number of hardlinks for the pid_entry table, excluding the .
+ * and .. links.
+ */
+static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+ unsigned int n)
+{
+ unsigned int i;
+ unsigned int count;
+
+ count = 0;
+ for (i = 0; i < n; ++i) {
+ if (S_ISDIR(entries[i].mode))
+ ++count;
+ }
+
+ return count;
+}
+
int maps_protect;
EXPORT_SYMBOL(maps_protect);
@@ -214,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
*/
if (task->parent == current && (task->ptrace & PT_PTRACED) &&
task_is_stopped_or_traced(task) &&
- ptrace_may_attach(task))
+ ptrace_may_access(task, PTRACE_MODE_ATTACH))
return 0;
/*
@@ -232,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
task_lock(task);
if (task->mm != mm)
goto out;
- if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+ if (task->mm != current->mm &&
+ __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
goto out;
task_unlock(task);
return mm;
@@ -499,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_attach(task);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
put_task_struct(task);
}
return allowed;
@@ -885,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!task)
goto out_no_task;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
ret = -ENOMEM;
@@ -2585,10 +2605,9 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 5;
-#ifdef CONFIG_SECURITY
- inode->i_nlink += 1;
-#endif
+
+ inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
+ ARRAY_SIZE(tgid_base_stuff));
dentry->d_op = &pid_dentry_operations;
@@ -2816,10 +2835,9 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 4;
-#ifdef CONFIG_SECURITY
- inode->i_nlink += 1;
-#endif
+
+ inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
+ ARRAY_SIZE(tid_base_stuff));
dentry->d_op = &pid_dentry_operations;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6f4e8dc97da..b08d1001791 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -425,7 +425,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
}
}
unlock_new_inode(inode);
- }
+ } else
+ module_put(de->owner);
return inode;
out_ino:
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 74a323d2b85..c652d469dc0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+ return 0;
+}
+
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -139,7 +144,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
- committed = atomic_read(&vm_committed_space);
+ committed = atomic_long_read(&vm_committed_space);
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
len += hugetlb_report_meminfo(page + len);
+ len += arch_report_meminfo(page + len);
+
return proc_calc_metrics(page, start, off, count, eof, len);
#undef K
}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
};
#endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
sum += temp;
per_irq_sum[j] += temp;
}
+ sum += arch_irq_stat_cpu(i);
}
+ sum += arch_irq_stat();
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
@@ -716,7 +732,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn = src / KPMSIZE;
count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
- return -EIO;
+ return -EINVAL;
while (count > 0) {
ppage = NULL;
@@ -726,7 +742,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
if (!ppage)
pcount = 0;
else
- pcount = atomic_read(&ppage->_count);
+ pcount = page_mapcount(ppage);
if (put_user(pcount, out++)) {
ret = -EFAULT;
@@ -782,7 +798,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn = src / KPMSIZE;
count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
- return -EIO;
+ return -EINVAL;
while (count > 0) {
ppage = NULL;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 88717c0f941..164bd9f9ede 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
dev_t dev = 0;
int len;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
if (file) {
@@ -315,9 +315,9 @@ struct mem_size_stats {
};
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct mem_size_stats *mss = private;
+ struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
@@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return 0;
}
-static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
-
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
int ret;
+ struct mm_walk smaps_walk = {
+ .pmd_entry = smaps_pte_range,
+ .mm = vma->vm_mm,
+ .private = &mss,
+ };
memset(&mss, 0, sizeof mss);
mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
- &smaps_walk, &mss);
+ walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
ret = show_map(m, v);
if (ret)
@@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = {
};
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, void *private)
+ unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = private;
+ struct vm_area_struct *vma = walk->private;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct mm_walk clear_refs_walk = {
+ .pmd_entry = clear_refs_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ clear_refs_walk.private = vma;
if (!is_vm_hugetlb_page(vma))
- walk_page_range(mm, vma->vm_start, vma->vm_end,
- &clear_refs_walk, vma);
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &clear_refs_walk);
+ }
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -496,7 +502,7 @@ const struct file_operations proc_clear_refs_operations = {
};
struct pagemapread {
- char __user *out, *end;
+ u64 __user *out, *end;
};
#define PM_ENTRY_BYTES sizeof(u64)
@@ -519,28 +525,18 @@ struct pagemapread {
static int add_to_pagemap(unsigned long addr, u64 pfn,
struct pagemapread *pm)
{
- /*
- * Make sure there's room in the buffer for an
- * entire entry. Otherwise, only copy part of
- * the pfn.
- */
- if (pm->out + PM_ENTRY_BYTES >= pm->end) {
- if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
- return -EFAULT;
- pm->out = pm->end;
- return PM_END_OF_BUFFER;
- }
-
if (put_user(pfn, pm->out))
return -EFAULT;
- pm->out += PM_ENTRY_BYTES;
+ pm->out++;
+ if (pm->out >= pm->end)
+ return PM_END_OF_BUFFER;
return 0;
}
static int pagemap_pte_hole(unsigned long start, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct pagemapread *pm = private;
+ struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
for (addr = start; addr < end; addr += PAGE_SIZE) {
@@ -557,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}
+static unsigned long pte_to_pagemap_entry(pte_t pte)
+{
+ unsigned long pme = 0;
+ if (is_swap_pte(pte))
+ pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
+ else if (pte_present(pte))
+ pme = PM_PFRAME(pte_pfn(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
+ return pme;
+}
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct pagemapread *pm = private;
+ struct vm_area_struct *vma;
+ struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
+ /* find the first VMA at or above 'addr' */
+ vma = find_vma(walk->mm, addr);
for (; addr != end; addr += PAGE_SIZE) {
u64 pfn = PM_NOT_PRESENT;
- pte = pte_offset_map(pmd, addr);
- if (is_swap_pte(*pte))
- pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
- else if (pte_present(*pte))
- pfn = PM_PFRAME(pte_pfn(*pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- /* unmap so we're not in atomic when we copy to userspace */
- pte_unmap(pte);
+
+ /* check to see if we've left 'vma' behind
+ * and need a new, higher one */
+ if (vma && (addr >= vma->vm_end))
+ vma = find_vma(walk->mm, addr);
+
+ /* check that 'vma' actually covers this address,
+ * and that it isn't a huge page vma */
+ if (vma && (vma->vm_start <= addr) &&
+ !is_vm_hugetlb_page(vma)) {
+ pte = pte_offset_map(pmd, addr);
+ pfn = pte_to_pagemap_entry(*pte);
+ /* unmap before userspace copy */
+ pte_unmap(pte);
+ }
err = add_to_pagemap(addr, pfn, pm);
if (err)
return err;
@@ -585,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static struct mm_walk pagemap_walk = {
- .pmd_entry = pagemap_pte_range,
- .pte_hole = pagemap_pte_hole
-};
-
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -624,17 +636,22 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
struct pagemapread pm;
int pagecount;
int ret = -ESRCH;
+ struct mm_walk pagemap_walk;
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
if (!task)
goto out;
ret = -EACCES;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_task;
ret = -EINVAL;
/* file position must be aligned */
- if (*ppos % PM_ENTRY_BYTES)
+ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
goto out_task;
ret = 0;
@@ -642,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!mm)
goto out_task;
- ret = -ENOMEM;
+
uaddr = (unsigned long)buf & PAGE_MASK;
uend = (unsigned long)(buf + count);
pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
- pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ ret = 0;
+ if (pagecount == 0)
+ goto out_mm;
+ pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+ ret = -ENOMEM;
if (!pages)
goto out_mm;
@@ -664,36 +685,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
goto out_pages;
}
- pm.out = buf;
- pm.end = buf + count;
+ pm.out = (u64 *)buf;
+ pm.end = (u64 *)(buf + count);
- if (!ptrace_may_attach(task)) {
- ret = -EIO;
- } else {
- unsigned long src = *ppos;
- unsigned long svpfn = src / PM_ENTRY_BYTES;
- unsigned long start_vaddr = svpfn << PAGE_SHIFT;
- unsigned long end_vaddr = TASK_SIZE_OF(task);
-
- /* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
- start_vaddr = end_vaddr;
-
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
- ret = walk_page_range(mm, start_vaddr, end_vaddr,
- &pagemap_walk, &pm);
- if (ret == PM_END_OF_BUFFER)
- ret = 0;
- /* don't need mmap_sem for these, but this looks cleaner */
- *ppos += pm.out - buf;
- if (!ret)
- ret = pm.out - buf;
- }
+ pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pte_hole = pagemap_pte_hole;
+ pagemap_walk.mm = mm;
+ pagemap_walk.private = &pm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ start_vaddr = svpfn << PAGE_SHIFT;
+ end_vaddr = TASK_SIZE_OF(task);
+
+ /* watch out for wraparound */
+ if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
+ if (ret == PM_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos += (char *)pm.out - buf;
+ if (!ret)
+ ret = (char *)pm.out - buf;
out_pages:
for (; pagecount; pagecount--) {
@@ -726,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f18..5d84e7121df 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b902430..78f613cb9c7 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f6..52312ec93ff 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c6..9ba495d5a29 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
- mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ /* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
- mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
-
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t retval;
-
- lock_kernel();
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(file->f_path.dentry->d_inode);
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
- unlock_kernel();
- return retval;
+ loff_t n;
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ n = generic_file_llseek_unlocked(file, offset, origin);
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return n;
}
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 57917932212..192269698a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode)
goto out;
reiserfs_update_inode_transaction(inode);
+ reiserfs_discard_prealloc(&th, inode);
+
err = reiserfs_delete_object(&th, inode);
/* Do quota update inside a transaction for journaled quotas. We must do that
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed424d708e6..1d40f2bd197 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
diff --git a/fs/select.c b/fs/select.c
index 8dda969614a..da0e88201c3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
retval++;
}
}
- cond_resched();
}
if (res_in)
*rinp = res_in;
@@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
+ cond_resched();
}
wait = NULL;
if (retval || !*timeout || signal_pending(current))
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7..2294783320c 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
return error;
}
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations smb_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = smb_remote_llseek,
.read = do_sync_read,
.aio_read = smb_file_aio_read,
.write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index 78150038b58..399442179d8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
*/
wait_on_page_writeback(page);
- if (PagePrivate(page))
- try_to_release_page(page, GFP_KERNEL);
+ if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
@@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
* Raced with truncate or failed to remove page from current
* address space, unlock and return failure.
*/
+out_unlock:
unlock_page(page);
return 1;
}
@@ -378,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(pages[page_nr]);
+ pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
@@ -983,7 +993,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
while (len) {
size_t read_len;
- loff_t pos = sd->pos;
+ loff_t pos = sd->pos, prev_pos = pos;
ret = do_splice_to(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
@@ -998,15 +1008,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* could get stuck data in the internal pipe:
*/
ret = actor(pipe, sd);
- if (unlikely(ret <= 0))
+ if (unlikely(ret <= 0)) {
+ sd->pos = prev_pos;
goto out_release;
+ }
bytes += ret;
len -= ret;
sd->pos = pos;
- if (ret < read_len)
+ if (ret < read_len) {
+ sd->pos = prev_pos + ret;
goto out_release;
+ }
}
done:
@@ -1072,7 +1086,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
if (ret > 0)
- *ppos += ret;
+ *ppos = sd.pos;
return ret;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7a5f69be6ac..44cc702f96c 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent)
/*
* Check whether there is an anchor block in the given block
*/
-static int udf_check_anchor_block(struct super_block *sb, sector_t block,
- bool varconv)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = NULL;
- tag *t;
+ struct buffer_head *bh;
uint16_t ident;
- uint32_t location;
- if (varconv) {
- if (udf_fixed_to_variable(block) >=
- sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
- bh = sb_bread(sb, udf_fixed_to_variable(block));
- }
- else
- bh = sb_bread(sb, block);
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+ udf_fixed_to_variable(block) >=
+ sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+ return 0;
+ bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
return 0;
-
- t = (tag *)bh->b_data;
- ident = le16_to_cpu(t->tagIdent);
- location = le32_to_cpu(t->tagLocation);
brelse(bh);
- if (ident != TAG_IDENT_AVDP)
- return 0;
- return location == block;
+
+ return ident == TAG_IDENT_AVDP;
}
/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
- sector_t lastblock)
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
{
sector_t last[6];
int i;
@@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
sb->s_blocksize_bits)
continue;
- if (udf_check_anchor_block(sb, last[i], varconv)) {
+ if (udf_check_anchor_block(sb, last[i])) {
sbi->s_anchor[0] = last[i];
sbi->s_anchor[1] = last[i] - 256;
return last[i];
@@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
if (last[i] < 256)
continue;
- if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
+ if (udf_check_anchor_block(sb, last[i] - 256)) {
sbi->s_anchor[1] = last[i] - 256;
return last[i];
}
}
- if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
sbi->s_anchor[0] = sbi->s_session + 256;
return last[0];
}
- if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
sbi->s_anchor[0] = sbi->s_session + 512;
return last[0];
}
@@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb)
int i;
struct udf_sb_info *sbi = UDF_SB(sb);
- lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
if (lastblock)
goto check_anchor;
/* No anchor found? Try VARCONV conversion of block numbers */
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
/* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb, 1,
+ lastblock = udf_scan_anchors(sb,
udf_variable_to_fixed(sbi->s_last_block));
- if (lastblock) {
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ if (lastblock)
goto check_anchor;
- }
/* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
- if (lastblock)
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
+ if (!lastblock) {
+ /* VARCONV didn't help. Clear it. */
+ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+ }
check_anchor:
/*
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8fa9c2d7091..8ec865de5f1 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -16,7 +16,7 @@
#define UDF_PREALLOCATE
#define UDF_DEFAULT_PREALLOC_BLOCKS 8
-#define UDFFS_DEBUG
+#undef UDFFS_DEBUG
#ifdef UDFFS_DEBUG
#define udf_debug(f, a...) \
diff --git a/fs/utimes.c b/fs/utimes.c
index af059d5cb48..b6b664e7145 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -40,14 +40,9 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
#endif
-static bool nsec_special(long nsec)
-{
- return nsec == UTIME_OMIT || nsec == UTIME_NOW;
-}
-
static bool nsec_valid(long nsec)
{
- if (nsec_special(nsec))
+ if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
return true;
return nsec >= 0 && nsec <= 999999999;
@@ -102,7 +97,11 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (error)
goto dput_and_out;
- /* Don't worry, the checks are done in inode_change_ok() */
+ if (times && times[0].tv_nsec == UTIME_NOW &&
+ times[1].tv_nsec == UTIME_NOW)
+ times = NULL;
+
+ /* In most cases, the checks are done in inode_change_ok() */
newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
if (times) {
error = -EPERM;
@@ -124,28 +123,34 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
newattrs.ia_valid |= ATTR_MTIME_SET;
}
- }
- /*
- * If times is NULL or both times are either UTIME_OMIT or
- * UTIME_NOW, then need to check permissions, because
- * inode_change_ok() won't do it.
- */
- if (!times || (nsec_special(times[0].tv_nsec) &&
- nsec_special(times[1].tv_nsec))) {
+ /*
+ * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT
+ * cases, we need to make an extra check that is not done by
+ * inode_change_ok().
+ */
+ if (((times[0].tv_nsec == UTIME_NOW &&
+ times[1].tv_nsec == UTIME_OMIT)
+ ||
+ (times[0].tv_nsec == UTIME_OMIT &&
+ times[1].tv_nsec == UTIME_NOW))
+ && !is_owner_or_cap(inode))
+ goto mnt_drop_write_and_out;
+ } else {
+
+ /*
+ * If times is NULL (or both times are UTIME_NOW),
+ * then we need to check permissions, because
+ * inode_change_ok() won't do it.
+ */
error = -EACCES;
if (IS_IMMUTABLE(inode))
goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
- if (f) {
- if (!(f->f_mode & FMODE_WRITE))
- goto mnt_drop_write_and_out;
- } else {
- error = vfs_permission(&nd, MAY_WRITE);
- if (error)
- goto mnt_drop_write_and_out;
- }
+ error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ goto mnt_drop_write_and_out;
}
}
mutex_lock(&inode->i_mutex);
@@ -169,14 +174,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
if (utimes) {
if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
return -EFAULT;
- if ((tstimes[0].tv_nsec == UTIME_OMIT ||
- tstimes[0].tv_nsec == UTIME_NOW) &&
- tstimes[0].tv_sec != 0)
- return -EINVAL;
- if ((tstimes[1].tv_nsec == UTIME_OMIT ||
- tstimes[1].tv_nsec == UTIME_NOW) &&
- tstimes[1].tv_sec != 0)
- return -EINVAL;
/* Nothing to do, we must not even check the path. */
if (tstimes[0].tv_nsec == UTIME_OMIT &&
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5..b546ba69be8 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
if (len == 0)
return -ENOENT;
- slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+ slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
if (slots == NULL)
return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *alias;
int err, table;
- lock_kernel();
+ lock_super(sb);
table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
if (IS_ERR(inode)) {
- unlock_kernel();
+ unlock_super(sb);
return ERR_CAST(inode);
}
alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
dput(alias);
else {
iput(inode);
- unlock_kernel();
+ unlock_super(sb);
return alias;
}
}
error:
- unlock_kernel();
+ unlock_super(sb);
dentry->d_op = &vfat_dentry_ops[table];
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
struct timespec ts;
int err;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_dir_empty(inode);
if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -791,10 +792,11 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(dir, &dentry->d_name, &sinfo);
if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, cluster;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec ts;
loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
goto out;
@@ -951,7 +954,7 @@ out:
brelse(sinfo.bh);
brelse(dotdot_bh);
brelse(old_sinfo.bh);
- unlock_kernel();
+ unlock_super(sb);
return err;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5105015a75a..98e0e86093b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
+ for (i = 0; i < bp->b_page_count; i++)
+ unlock_page(bp->b_pages[i]);
return -ENOMEM;
}
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
ASSERT(!PagePrivate(page));
if (!PageUptodate(page)) {
page_count--;
- if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) {
+ if (blocksize >= PAGE_CACHE_SIZE) {
+ if (flags & XBF_READ)
+ bp->b_flags |= _XBF_PAGE_LOCKED;
+ } else if (!PagePrivate(page)) {
if (test_page_region(page, offset, nbytes))
page_count++;
}
}
- unlock_page(page);
bp->b_pages[i] = page;
offset = 0;
}
+ if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
+ for (i = 0; i < bp->b_page_count; i++)
+ unlock_page(bp->b_pages[i]);
+ }
+
if (page_count == bp->b_page_count)
bp->b_flags |= XBF_DONE;
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
bp->b_count_desired = len;
bp->b_buffer_length = buflen;
bp->b_flags |= XBF_MAPPED;
+ bp->b_flags &= ~_XBF_PAGE_LOCKED;
return 0;
}
@@ -1093,8 +1103,10 @@ _xfs_buf_ioend(
xfs_buf_t *bp,
int schedule)
{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ bp->b_flags &= ~_XBF_PAGE_LOCKED;
xfs_buf_ioend(bp, schedule);
+ }
}
STATIC void
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
+
+ if (bp->b_flags & _XBF_PAGE_LOCKED)
+ unlock_page(page);
} while (bvec >= bio->bi_io_vec);
_xfs_buf_ioend(bp, 1);
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
* filesystem block size is not smaller than the page size.
*/
if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
- (bp->b_flags & XBF_READ) &&
+ ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
+ (XBF_READ|_XBF_PAGE_LOCKED)) &&
(blocksize >= PAGE_CACHE_SIZE)) {
bio = bio_alloc(GFP_NOIO, 1);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 841d7883528..f948ec7ba9a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -66,6 +66,25 @@ typedef enum {
_XBF_PAGES = (1 << 18), /* backed by refcounted pages */
_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
_XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
+
+ /*
+ * Special flag for supporting metadata blocks smaller than a FSB.
+ *
+ * In this case we can have multiple xfs_buf_t on a single page and
+ * need to lock out concurrent xfs_buf_t readers as they only
+ * serialise access to the buffer.
+ *
+ * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+ * between reads of the page. Hence we can have one thread read the
+ * page and modify it, but then race with another thread that thinks
+ * the page is not up-to-date and hence reads it again.
+ *
+ * The result is that the first modifcation to the page is lost.
+ * This sort of AGF/AGI reading race can happen when unlinking inodes
+ * that require truncation and results in the AGI unlinked list
+ * modifications being lost.
+ */
+ _XBF_PAGE_LOCKED = (1 << 22),
} xfs_buf_flags_t;
typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4a..5f60363b934 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
return -xfs_release(XFS_I(inode));
}
+/*
+ * We ignore the datasync flag here because a datasync is effectively
+ * identical to an fsync. That is, datasync implies that we need to write
+ * only the metadata needed to be able to access the data that is written
+ * if we crash after the call completes. Hence if we are writing beyond
+ * EOF we have to log the inode size change as well, which makes it a
+ * full fsync. If we don't write beyond EOF, the inode core will be
+ * clean in memory and so we don't need to log the inode, just like
+ * fsync.
+ */
STATIC int
xfs_file_fsync(
struct file *filp,
struct dentry *dentry,
int datasync)
{
- int flags = FSYNC_WAIT;
-
- if (datasync)
- flags |= FSYNC_DATA;
xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
- return -xfs_fsync(XFS_I(dentry->d_inode), flags,
- (xfs_off_t)0, (xfs_off_t)-1);
+ return -xfs_fsync(XFS_I(dentry->d_inode));
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc..25eb2a9e8d9 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
/*
- * Flags to vop_fsync/reclaim.
- */
-#define FSYNC_NOWAIT 0 /* asynchronous flush */
-#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
-#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
-
-/*
* Tracking vnode activity.
*/
#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index cf0bb9c1d62..e569bf5d6cf 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
xfs_mount_t *mp = ip->i_mount;
xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
unsigned long first_index, mask;
+ unsigned long inodes_per_cluster;
int ilist_size;
xfs_inode_t **ilist;
xfs_inode_t *iq;
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
ASSERT(pag->pagi_inodeok);
ASSERT(pag->pag_ici_init);
- ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
- ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+ inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+ ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
if (!ilist)
return 0;
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
read_lock(&pag->pag_ici_lock);
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
- first_index,
- XFS_INODE_CLUSTER_SIZE(mp));
+ first_index, inodes_per_cluster);
if (nr_found == 0)
goto out_free;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0e..ad3d26ddfe3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- /* If I'm the only one writing to this iclog, sync it to disk */
- if (atomic_read(&iclog->ic_refcnt) == 1) {
+ /*
+ * If I'm the only one writing to this iclog, sync it to disk.
+ * We need to do an atomic compare and decrement here to avoid
+ * racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+ /* we are the only one */
spin_unlock(&log->l_icloglock);
- if ((error = xlog_state_release_iclog(log, iclog)))
+ error = xlog_state_release_iclog(log, iclog);
+ if (error)
return error;
} else {
- atomic_dec(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
}
goto restart;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4b..e475e3717eb 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
/*
* xfs_fsync
*
- * This is called to sync the inode and its data out to disk.
- * We need to hold the I/O lock while flushing the data, and
- * the inode lock while flushing the inode. The inode lock CANNOT
- * be held while flushing the data, so acquire after we're done
- * with that.
+ * This is called to sync the inode and its data out to disk. We need to hold
+ * the I/O lock while flushing the data, and the inode lock while flushing the
+ * inode. The inode lock CANNOT be held while flushing the data, so acquire
+ * after we're done with that.
*/
int
xfs_fsync(
- xfs_inode_t *ip,
- int flag,
- xfs_off_t start,
- xfs_off_t stop)
+ xfs_inode_t *ip)
{
xfs_trans_t *tp;
int error;
@@ -875,103 +871,79 @@ xfs_fsync(
xfs_itrace_entry(ip);
- ASSERT(start >= 0 && stop >= -1);
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
- if (flag & FSYNC_DATA)
- filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+ /* capture size updates in I/O completion before writing the inode. */
+ error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+ if (error)
+ return XFS_ERROR(error);
/*
- * We always need to make sure that the required inode state
- * is safe on disk. The vnode might be clean but because
- * of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional
- * changes to the inode core that have to go to disk.
+ * We always need to make sure that the required inode state is safe on
+ * disk. The vnode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
*
- * The following code depends on one assumption: that
- * any transaction that changes an inode logs the core
- * because it has to change some field in the inode core
- * (typically nextents or nblocks). That assumption
- * implies that any transactions against an inode will
- * catch any non-transactional updates. If inode-altering
- * transactions exist that violate this assumption, the
- * code breaks. Right now, it figures that if the involved
- * update_* field is clear and the inode is unpinned, the
- * inode is clean. Either it's been flushed or it's been
- * committed and the commit has hit the disk unpinning the inode.
- * (Note that xfs_inode_item_format() called at commit clears
- * the update_* fields.)
+ * This code relies on the assumption that if the update_* fields
+ * of the inode are clear and the inode is unpinned then it is clean
+ * and no action is required.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- /* If we are flushing data then we care about update_size
- * being set, otherwise we care about update_core
- */
- if ((flag & FSYNC_DATA) ?
- (ip->i_update_size == 0) :
- (ip->i_update_core == 0)) {
+ if (!(ip->i_update_size || ip->i_update_core)) {
/*
- * Timestamps/size haven't changed since last inode
- * flush or inode transaction commit. That means
- * either nothing got written or a transaction
- * committed which caught the updates. If the
- * latter happened and the transaction hasn't
- * hit the disk yet, the inode will be still
- * be pinned. If it is, force the log.
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
*/
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
- _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
- XFS_LOG_FORCE |
- ((flag & FSYNC_WAIT)
- ? XFS_LOG_SYNC : 0),
+ error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+ XFS_LOG_FORCE | XFS_LOG_SYNC,
&log_flushed);
} else {
/*
- * If the inode is not pinned and nothing
- * has changed we don't need to flush the
- * cache.
+ * If the inode is not pinned and nothing has changed
+ * we don't need to flush the cache.
*/
changed = 0;
}
- error = 0;
} else {
/*
- * Kick off a transaction to log the inode
- * core to get the updates. Make it
- * sync if FSYNC_WAIT is passed in (which
- * is done by everybody but specfs). The
- * sync transaction will also force the log.
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
*/
xfs_iunlock(ip, XFS_ILOCK_SHARED);
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(ip->i_mount),
- 0, 0, 0))) {
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
/*
- * Note - it's possible that we might have pushed
- * ourselves out of the way during trans_reserve
- * which would flush the inode. But there's no
- * guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer
- * could be pinned anyway if it's part of an
- * inode in another recent transaction. So we
- * play it safe and fire off the transaction anyway.
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
*/
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flag & FSYNC_WAIT)
- xfs_trans_set_sync(tp);
+ xfs_trans_set_sync(tp);
error = _xfs_trans_commit(tp, 0, &log_flushed);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e2..57335ba4ce5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
struct cred *credp);
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
- xfs_off_t stop);
+int xfs_fsync(struct xfs_inode *ip);
int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,