Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 19
-rw-r--r--  fs/block_dev.c | 8
-rw-r--r--  fs/char_dev.c | 6
-rw-r--r--  fs/coda/psdev.c | 8
-rw-r--r--  fs/compat.c | 4
-rw-r--r--  fs/configfs/dir.c | 5
-rw-r--r--  fs/configfs/file.c | 2
-rw-r--r--  fs/configfs/mount.c | 13
-rw-r--r--  fs/debugfs/inode.c | 13
-rw-r--r--  fs/dlm/lockspace.c | 50
-rw-r--r--  fs/ecryptfs/main.c | 129
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  fs/fuse/inode.c | 26
-rw-r--r--  fs/gfs2/Makefile | 2
-rw-r--r--  fs/gfs2/bmap.c | 37
-rw-r--r--  fs/gfs2/bmap.h | 2
-rw-r--r--  fs/gfs2/daemon.c | 50
-rw-r--r--  fs/gfs2/daemon.h | 1
-rw-r--r--  fs/gfs2/dir.c | 4
-rw-r--r--  fs/gfs2/eaops.c | 84
-rw-r--r--  fs/gfs2/eattr.c | 2
-rw-r--r--  fs/gfs2/glock.c | 83
-rw-r--r--  fs/gfs2/glops.c | 110
-rw-r--r--  fs/gfs2/incore.h | 47
-rw-r--r--  fs/gfs2/inode.c | 41
-rw-r--r--  fs/gfs2/inode.h | 12
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 5
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 18
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 38
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 9
-rw-r--r--  fs/gfs2/log.c | 119
-rw-r--r--  fs/gfs2/log.h | 14
-rw-r--r--  fs/gfs2/lops.c | 71
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 97
-rw-r--r--  fs/gfs2/meta_io.h | 1
-rw-r--r--  fs/gfs2/ops_address.c | 649
-rw-r--r--  fs/gfs2/ops_address.h | 7
-rw-r--r--  fs/gfs2/ops_file.c | 229
-rw-r--r--  fs/gfs2/ops_file.h | 24
-rw-r--r--  fs/gfs2/ops_fstype.c | 73
-rw-r--r--  fs/gfs2/ops_inode.c | 20
-rw-r--r--  fs/gfs2/ops_inode.h | 6
-rw-r--r--  fs/gfs2/ops_super.c | 1
-rw-r--r--  fs/gfs2/ops_vm.c | 169
-rw-r--r--  fs/gfs2/ops_vm.h | 18
-rw-r--r--  fs/gfs2/quota.c | 29
-rw-r--r--  fs/gfs2/recovery.c | 18
-rw-r--r--  fs/gfs2/rgrp.c | 104
-rw-r--r--  fs/gfs2/rgrp.h | 4
-rw-r--r--  fs/gfs2/super.c | 25
-rw-r--r--  fs/gfs2/sys.c | 36
-rw-r--r--  fs/gfs2/trans.c | 5
-rw-r--r--  fs/gfs2/trans.h | 1
-rw-r--r--  fs/hfs/btree.c | 7
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jfs/jfs_dtree.c | 27
-rw-r--r--  fs/jfs/jfs_dtree.h | 4
-rw-r--r--  fs/jfs/jfs_imap.c | 4
-rw-r--r--  fs/jfs/jfs_logmgr.c | 34
-rw-r--r--  fs/jfs/jfs_metapage.c | 43
-rw-r--r--  fs/jfs/jfs_mount.c | 2
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 2
-rw-r--r--  fs/jfs/super.c | 6
-rw-r--r--  fs/namei.c | 4
-rw-r--r--  fs/namespace.c | 11
-rw-r--r--  fs/nfsd/nfs3xdr.c | 5
-rw-r--r--  fs/nfsd/nfsxdr.c | 5
-rw-r--r--  fs/ocfs2/Makefile | 5
-rw-r--r--  fs/ocfs2/alloc.c | 8
-rw-r--r--  fs/ocfs2/aops.c | 137
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 65
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 2
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 4
-rw-r--r--  fs/ocfs2/cluster/sys.c | 83
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 4
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 8
-rw-r--r--  fs/ocfs2/cluster/ver.c | 2
-rw-r--r--  fs/ocfs2/dcache.c | 8
-rw-r--r--  fs/ocfs2/dir.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmfsver.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 19
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 546
-rw-r--r--  fs/ocfs2/dlmglue.h | 31
-rw-r--r--  fs/ocfs2/endian.h | 5
-rw-r--r--  fs/ocfs2/export.c | 8
-rw-r--r--  fs/ocfs2/file.c | 163
-rw-r--r--  fs/ocfs2/file.h | 6
-rw-r--r--  fs/ocfs2/heartbeat.c | 80
-rw-r--r--  fs/ocfs2/heartbeat.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 84
-rw-r--r--  fs/ocfs2/inode.h | 10
-rw-r--r--  fs/ocfs2/ioctl.c | 31
-rw-r--r--  fs/ocfs2/journal.c | 51
-rw-r--r--  fs/ocfs2/journal.h | 6
-rw-r--r--  fs/ocfs2/localalloc.c | 50
-rw-r--r--  fs/ocfs2/locks.c | 125
-rw-r--r--  fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h) | 29
-rw-r--r--  fs/ocfs2/mmap.c | 17
-rw-r--r--  fs/ocfs2/namei.c | 66
-rw-r--r--  fs/ocfs2/ocfs2.h | 35
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 22
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/resize.c | 634
-rw-r--r--  fs/ocfs2/resize.h | 32
-rw-r--r--  fs/ocfs2/slot_map.c | 19
-rw-r--r--  fs/ocfs2/slot_map.h | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 20
-rw-r--r--  fs/ocfs2/suballoc.h | 8
-rw-r--r--  fs/ocfs2/super.c | 140
-rw-r--r--  fs/ocfs2/sysfile.c | 2
-rw-r--r--  fs/ocfs2/ver.c | 2
-rw-r--r--  fs/ocfs2/vote.c | 756
-rw-r--r--  fs/openpromfs/inode.c | 2
-rw-r--r--  fs/partitions/check.c | 327
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 78
-rw-r--r--  fs/read_write.c | 63
-rw-r--r--  fs/splice.c | 8
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/sysfs/file.c | 67
-rw-r--r--  fs/sysfs/symlink.c | 88
126 files changed, 3403 insertions, 3307 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 487236c6583..b6df18f1f67 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -440,14 +440,8 @@ config OCFS2_FS
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
- Note: Features which OCFS2 does not support yet:
- - extended attributes
- - quotas
- - cluster aware flock
- - Directory change notification (F_NOTIFY)
- - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
- - POSIX ACLs
- - readpages / writepages (not user visible)
+ For more information on OCFS2, see the file
+ <file:Documentation/filesystems/ocfs2.txt>.
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
@@ -1028,8 +1022,8 @@ config HUGETLB_PAGE
def_bool HUGETLBFS
config CONFIGFS_FS
- tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
- depends on SYSFS && EXPERIMENTAL
+ tristate "Userspace-driven configuration filesystem"
+ depends on SYSFS
help
configfs is a ram-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
@@ -1112,8 +1106,8 @@ config HFS_FS
help
If you say Y here, you will be able to mount Macintosh-formatted
floppy disks and hard drive partitions with full read-write access.
- Please read <file:fs/hfs/HFS.txt> to learn about the available mount
- options.
+ Please read <file:Documentation/filesystems/hfs.txt> to learn about
+ the available mount options.
To compile this file system support as a module, choose M here: the
module will be called hfs.
@@ -2130,4 +2124,3 @@ source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
endmenu
-
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c5522..e48a630ae26 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release);
static struct kobject *bdev_get_kobj(struct block_device *bdev)
{
if (bdev->bd_contains != bdev)
- return kobject_get(&bdev->bd_part->kobj);
+ return kobject_get(&bdev->bd_part->dev.kobj);
else
- return kobject_get(&bdev->bd_disk->kobj);
+ return kobject_get(&bdev->bd_disk->dev.kobj);
}
static struct kobject *bdev_get_holder(struct block_device *bdev)
@@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
ret = -ENXIO;
goto out_first;
}
- kobject_get(&p->kobj);
+ kobject_get(&p->dev.kobj);
bdev->bd_part = p;
bd_set_size(bdev, (loff_t) p->nr_sects << 9);
}
@@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
module_put(owner);
if (bdev->bd_contains != bdev) {
- kobject_put(&bdev->bd_part->kobj);
+ kobject_put(&bdev->bd_part->dev.kobj);
bdev->bd_part = NULL;
}
bdev->bd_disk = NULL;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c3bfa76765c..2c7a8b5b459 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void)
{
struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
if (p) {
- p->kobj.ktype = &ktype_cdev_dynamic;
INIT_LIST_HEAD(&p->list);
- kobject_init(&p->kobj);
+ kobject_init(&p->kobj, &ktype_cdev_dynamic);
}
return p;
}
@@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
{
memset(cdev, 0, sizeof *cdev);
INIT_LIST_HEAD(&cdev->list);
- cdev->kobj.ktype = &ktype_cdev_default;
- kobject_init(&cdev->kobj);
+ kobject_init(&cdev->kobj, &ktype_cdev_default);
cdev->ops = fops;
}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index dcc6aead70f..e3eb3556622 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,8 @@ static int init_coda_psdev(void)
goto out_chrdev;
}
for (i = 0; i < MAX_CODADEVS; i++)
- class_device_create(coda_psdev_class, NULL,
- MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
+ device_create(coda_psdev_class, NULL,
+ MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
coda_sysctl_init();
goto out;
@@ -405,7 +405,7 @@ static int __init init_coda(void)
return 0;
out:
for (i = 0; i < MAX_CODADEVS; i++)
- class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+ device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
class_destroy(coda_psdev_class);
unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
coda_sysctl_clean();
@@ -424,7 +424,7 @@ static void __exit exit_coda(void)
printk("coda: failed to unregister filesystem\n");
}
for (i = 0; i < MAX_CODADEVS; i++)
- class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+ device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
class_destroy(coda_psdev_class);
unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
coda_sysctl_clean();
diff --git a/fs/compat.c b/fs/compat.c
index 15078ce4c04..5216c3fd751 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
if (ret < 0)
goto out;
- ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
- if (ret)
- goto out;
-
fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098b..a48dc7dd876 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
* That said, taking our i_mutex is closer to mkdir
* emulation, and shouldn't hurt.
*/
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
for (i = 0; group->default_groups[i]; i++) {
new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
sd = configfs_sb->s_root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+ mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
+ I_MUTEX_PARENT);
name.name = group->cg_item.ci_name;
name.len = strlen(name.name);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082..397cb503a18 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
mutex_unlock(&dir->d_inode->i_mutex);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3bf0278ea84..de3b31d0a37 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
}
-static decl_subsys(config, NULL, NULL);
+static struct kobject *config_kobj;
static int __init configfs_init(void)
{
@@ -140,9 +140,8 @@ static int __init configfs_init(void)
if (!configfs_dir_cachep)
goto out;
- kobj_set_kset_s(&config_subsys, kernel_subsys);
- err = subsystem_register(&config_subsys);
- if (err) {
+ config_kobj = kobject_create_and_add("config", kernel_kobj);
+ if (!config_kobj) {
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
goto out;
@@ -151,7 +150,7 @@ static int __init configfs_init(void)
err = register_filesystem(&configfs_fs_type);
if (err) {
printk(KERN_ERR "configfs: Unable to register filesystem!\n");
- subsystem_unregister(&config_subsys);
+ kobject_put(config_kobj);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
goto out;
@@ -160,7 +159,7 @@ static int __init configfs_init(void)
err = configfs_inode_init();
if (err) {
unregister_filesystem(&configfs_fs_type);
- subsystem_unregister(&config_subsys);
+ kobject_put(config_kobj);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
}
@@ -171,7 +170,7 @@ out:
static void __exit configfs_exit(void)
{
unregister_filesystem(&configfs_fs_type);
- subsystem_unregister(&config_subsys);
+ kobject_put(config_kobj);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
configfs_inode_exit();
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6a713b33992..d26e2826ba5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,20 +426,19 @@ exit:
}
EXPORT_SYMBOL_GPL(debugfs_rename);
-static decl_subsys(debug, NULL, NULL);
+static struct kobject *debug_kobj;
static int __init debugfs_init(void)
{
int retval;
- kobj_set_kset_s(&debug_subsys, kernel_subsys);
- retval = subsystem_register(&debug_subsys);
- if (retval)
- return retval;
+ debug_kobj = kobject_create_and_add("debug", kernel_kobj);
+ if (!debug_kobj)
+ return -EINVAL;
retval = register_filesystem(&debug_fs_type);
if (retval)
- subsystem_unregister(&debug_subsys);
+ kobject_put(debug_kobj);
return retval;
}
@@ -447,7 +446,7 @@ static void __exit debugfs_exit(void)
{
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
unregister_filesystem(&debug_fs_type);
- subsystem_unregister(&debug_subsys);
+ kobject_put(debug_kobj);
}
core_initcall(debugfs_init);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 6353a838452..5c108c49cb8 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -166,26 +166,7 @@ static struct kobj_type dlm_ktype = {
.release = lockspace_kobj_release,
};
-static struct kset dlm_kset = {
- .ktype = &dlm_ktype,
-};
-
-static int kobject_setup(struct dlm_ls *ls)
-{
- char lsname[DLM_LOCKSPACE_LEN];
- int error;
-
- memset(lsname, 0, DLM_LOCKSPACE_LEN);
- snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
-
- error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
- if (error)
- return error;
-
- ls->ls_kobj.kset = &dlm_kset;
- ls->ls_kobj.ktype = &dlm_ktype;
- return 0;
-}
+static struct kset *dlm_kset;
static int do_uevent(struct dlm_ls *ls, int in)
{
@@ -220,24 +201,22 @@ static int do_uevent(struct dlm_ls *ls, int in)
int dlm_lockspace_init(void)
{
- int error;
-
ls_count = 0;
mutex_init(&ls_lock);
INIT_LIST_HEAD(&lslist);
spin_lock_init(&lslist_lock);
- kobject_set_name(&dlm_kset.kobj, "dlm");
- kobj_set_kset_s(&dlm_kset, kernel_subsys);
- error = kset_register(&dlm_kset);
- if (error)
- printk("dlm_lockspace_init: cannot register kset %d\n", error);
- return error;
+ dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
+ if (!dlm_kset) {
+ printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+ return 0;
}
void dlm_lockspace_exit(void)
{
- kset_unregister(&dlm_kset);
+ kset_unregister(dlm_kset);
}
static int dlm_scand(void *data)
@@ -549,13 +528,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
goto out_delist;
}
- error = kobject_setup(ls);
- if (error)
- goto out_stop;
-
- error = kobject_register(&ls->ls_kobj);
+ ls->ls_kobj.kset = dlm_kset;
+ error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
+ "%s", ls->ls_name);
if (error)
goto out_stop;
+ kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
/* let kobject handle freeing of ls if there's an error */
do_unreg = 1;
@@ -601,7 +579,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
kfree(ls->ls_rsbtbl);
out_lsfree:
if (do_unreg)
- kobject_unregister(&ls->ls_kobj);
+ kobject_put(&ls->ls_kobj);
else
kfree(ls);
out:
@@ -750,7 +728,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_clear_members(ls);
dlm_clear_members_gone(ls);
kfree(ls->ls_node_array);
- kobject_unregister(&ls->ls_kobj);
+ kobject_put(&ls->ls_kobj);
/* The ls structure will be freed when the kobject is done with */
mutex_lock(&ls_lock);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e5580bcb923..0249aa4ae18 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,127 +734,40 @@ static int ecryptfs_init_kmem_caches(void)
return 0;
}
-struct ecryptfs_obj {
- char *name;
- struct list_head slot_list;
- struct kobject kobj;
-};
-
-struct ecryptfs_attribute {
- struct attribute attr;
- ssize_t(*show) (struct ecryptfs_obj *, char *);
- ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
-};
+static struct kobject *ecryptfs_kobj;
-static ssize_t
-ecryptfs_attr_store(struct kobject *kobj,
- struct attribute *attr, const char *buf, size_t len)
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buff)
{
- struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
- kobj);
- struct ecryptfs_attribute *attribute =
- container_of(attr, struct ecryptfs_attribute, attr);
-
- return (attribute->store ? attribute->store(obj, buf, len) : 0);
+ return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
}
-static ssize_t
-ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
- kobj);
- struct ecryptfs_attribute *attribute =
- container_of(attr, struct ecryptfs_attribute, attr);
-
- return (attribute->show ? attribute->show(obj, buf) : 0);
-}
+static struct kobj_attribute version_attr = __ATTR_RO(version);
-static struct sysfs_ops ecryptfs_sysfs_ops = {
- .show = ecryptfs_attr_show,
- .store = ecryptfs_attr_store
+static struct attribute *attributes[] = {
+ &version_attr.attr,
+ NULL,
};
-static struct kobj_type ecryptfs_ktype = {
- .sysfs_ops = &ecryptfs_sysfs_ops
+static struct attribute_group attr_group = {
+ .attrs = attributes,
};
-static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
-
-static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
-{
- return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
-}
-
-static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
-
-static struct ecryptfs_version_str_map_elem {
- u32 flag;
- char *str;
-} ecryptfs_version_str_map[] = {
- {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
- {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
- {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
- {ECRYPTFS_VERSIONING_POLICY, "policy"},
- {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"},
- {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
-};
-
-static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
-{
- int i;
- int remaining = PAGE_SIZE;
- int total_written = 0;
-
- buff[0] = '\0';
- for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
- int entry_size;
-
- if (!(ECRYPTFS_VERSIONING_MASK
- & ecryptfs_version_str_map[i].flag))
- continue;
- entry_size = strlen(ecryptfs_version_str_map[i].str);
- if ((entry_size + 2) > remaining)
- goto out;
- memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
- buff[entry_size++] = '\n';
- buff[entry_size] = '\0';
- buff += entry_size;
- total_written += entry_size;
- remaining -= entry_size;
- }
-out:
- return total_written;
-}
-
-static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
-
static int do_sysfs_registration(void)
{
int rc;
- rc = subsystem_register(&ecryptfs_subsys);
- if (rc) {
- printk(KERN_ERR
- "Unable to register ecryptfs sysfs subsystem\n");
- goto out;
- }
- rc = sysfs_create_file(&ecryptfs_subsys.kobj,
- &sysfs_attr_version.attr);
- if (rc) {
- printk(KERN_ERR
- "Unable to create ecryptfs version attribute\n");
- subsystem_unregister(&ecryptfs_subsys);
+ ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
+ if (!ecryptfs_kobj) {
+ printk(KERN_ERR "Unable to create ecryptfs kset\n");
+ rc = -ENOMEM;
goto out;
}
- rc = sysfs_create_file(&ecryptfs_subsys.kobj,
- &sysfs_attr_version_str.attr);
+ rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
if (rc) {
printk(KERN_ERR
- "Unable to create ecryptfs version_str attribute\n");
- sysfs_remove_file(&ecryptfs_subsys.kobj,
- &sysfs_attr_version.attr);
- subsystem_unregister(&ecryptfs_subsys);
- goto out;
+ "Unable to create ecryptfs version attributes\n");
+ kobject_put(ecryptfs_kobj);
}
out:
return rc;
@@ -862,11 +775,8 @@ out:
static void do_sysfs_unregistration(void)
{
- sysfs_remove_file(&ecryptfs_subsys.kobj,
- &sysfs_attr_version.attr);
- sysfs_remove_file(&ecryptfs_subsys.kobj,
- &sysfs_attr_version_str.attr);
- subsystem_unregister(&ecryptfs_subsys);
+ sysfs_remove_group(ecryptfs_kobj, &attr_group);
+ kobject_put(ecryptfs_kobj);
}
static int __init ecryptfs_init(void)
@@ -894,7 +804,6 @@ static int __init ecryptfs_init(void)
printk(KERN_ERR "Failed to register filesystem\n");
goto out_free_kmem_caches;
}
- kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
rc = do_sysfs_registration();
if (rc) {
printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0fca82021d7..300324bd563 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0)
break;
}
- if (!list_empty(&sb->s_more_io))
- wbc->more_io = 1;
return; /* Leave any unwritten inodes on s_io */
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 84f9f7dfdf5..e5e80d1a468 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void)
}
#endif
-static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, NULL, NULL);
-
static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
{
struct inode * inode = foo;
@@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void)
kmem_cache_destroy(fuse_inode_cachep);
}
+static struct kobject *fuse_kobj;
+static struct kobject *connections_kobj;
+
static int fuse_sysfs_init(void)
{
int err;
- kobj_set_kset_s(&fuse_subsys, fs_subsys);
- err = subsystem_register(&fuse_subsys);
- if (err)
+ fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
+ if (!fuse_kobj) {
+ err = -ENOMEM;
goto out_err;
+ }
- kobj_set_kset_s(&connections_subsys, fuse_subsys);
- err = subsystem_register(&connections_subsys);
- if (err)
+ connections_kobj = kobject_create_and_add("connections", fuse_kobj);
+ if (!connections_kobj) {
+ err = -ENOMEM;
goto out_fuse_unregister;
+ }
return 0;
out_fuse_unregister:
- subsystem_unregister(&fuse_subsys);
+ kobject_put(fuse_kobj);
out_err:
return err;
}
static void fuse_sysfs_cleanup(void)
{
- subsystem_unregister(&connections_subsys);
- subsystem_unregister(&fuse_subsys);
+ kobject_put(connections_kobj);
+ kobject_put(fuse_kobj);
}
static int __init fuse_init(void)
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 04ad0caebed..8fff11058ce 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o
gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
- ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+ ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93fa427bb5f..e4effc47abf 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -59,7 +59,6 @@ struct strip_mine {
static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
u64 block, struct page *page)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct buffer_head *bh;
int release = 0;
@@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
set_buffer_uptodate(bh);
if (!gfs2_is_jdata(ip))
mark_buffer_dirty(bh);
- if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ if (!gfs2_is_writeback(ip))
gfs2_trans_add_bh(ip->i_gl, bh, 0);
if (release) {
@@ -453,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create)
* Returns: errno
*/
-int gfs2_block_map(struct inode *inode, u64 lblock, int create,
- struct buffer_head *bh_map)
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_map, int create)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -470,6 +469,7 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
struct metapath mp;
u64 size;
+ struct buffer_head *dibh = NULL;
BUG_ON(maxlen == 0);
@@ -500,6 +500,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_fail;
+ dibh = bh;
+ get_bh(dibh);
for (x = 0; x < end_of_metadata; x++) {
lookup_block(ip, bh, x, &mp, create, &new, &dblock);
@@ -518,13 +520,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
if (boundary)
set_buffer_boundary(bh_map);
if (new) {
- struct buffer_head *dibh;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(ip, dibh->b_data);
set_buffer_new(bh_map);
goto out_brelse;
}
@@ -545,6 +542,8 @@ out_brelse:
out_ok:
error = 0;
out_fail:
+ if (dibh)
+ brelse(dibh);
bmap_unlock(inode, create);
return error;
}
@@ -560,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
BUG_ON(!new);
bh.b_size = 1 << (inode->i_blkbits + 5);
- ret = gfs2_block_map(inode, lblock, create, &bh);
+ ret = gfs2_block_map(inode, lblock, &bh, create);
*extlen = bh.b_size >> inode->i_blkbits;
*dblock = bh.b_blocknr;
if (buffer_new(&bh))
@@ -684,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
if (error)
return error;
@@ -786,7 +785,7 @@ out_rg_gunlock:
out_rlist:
gfs2_rlist_free(&rlist);
out:
- gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+ gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
@@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
loff_t from = inode->i_size;
unsigned long index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -911,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
err = 0;
if (!buffer_mapped(bh)) {
- gfs2_get_block(inode, iblock, bh, 0);
+ gfs2_block_map(inode, iblock, bh, 0);
/* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh))
goto unlock;
@@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
err = 0;
}
- if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ if (!gfs2_is_writeback(ip))
gfs2_trans_add_bh(ip->i_gl, bh, 0);
zero_user_page(page, offset, length, KM_USER0);
@@ -1224,8 +1222,13 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
do_div(lblock_stop, bsize);
} else {
unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+ u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
lblock = offset >> shift;
lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+ if (lblock_stop > end_of_file) {
+ *alloc_required = 1;
+ return 0;
+ }
}
for (; lblock < lblock_stop; lblock += extlen) {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index ac2fd04370d..4e6cde2943b 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -15,7 +15,7 @@ struct gfs2_inode;
struct page;
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
+int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3731ab0771d..e51991947d2 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -83,56 +83,6 @@ int gfs2_recoverd(void *data)
}
/**
- * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
- * @sdp: Pointer to GFS2 superblock
- *
- * Also, periodically check to make sure that we're using the most recent
- * journal index.
- */
-
-int gfs2_logd(void *data)
-{
- struct gfs2_sbd *sdp = data;
- struct gfs2_holder ji_gh;
- unsigned long t;
- int need_flush;
-
- while (!kthread_should_stop()) {
- /* Advance the log tail */
-
- t = sdp->sd_log_flush_time +
- gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
-
- gfs2_ail1_empty(sdp, DIO_ALL);
- gfs2_log_lock(sdp);
- need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
- gfs2_log_unlock(sdp);
- if (need_flush || time_after_eq(jiffies, t)) {
- gfs2_log_flush(sdp, NULL);
- sdp->sd_log_flush_time = jiffies;
- }
-
- /* Check for latest journal index */
-
- t = sdp->sd_jindex_refresh_time +
- gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
-
- if (time_after_eq(jiffies, t)) {
- if (!gfs2_jindex_hold(sdp, &ji_gh))
- gfs2_glock_dq_uninit(&ji_gh);
- sdp->sd_jindex_refresh_time = jiffies;
- }
-
- t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
- if (freezing(current))
- refrigerator();
- schedule_timeout_interruptible(t);
- }
-
- return 0;
-}
-
-/**
* gfs2_quotad - Write cached quota changes into the quota file
* @sdp: Pointer to GFS2 superblock
*
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 0de9b355795..4be084fb6a6 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -12,7 +12,6 @@
int gfs2_glockd(void *data);
int gfs2_recoverd(void *data);
-int gfs2_logd(void *data);
int gfs2_quotad(void *data);
#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9949bb746a5..57e2ed932ad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out;
- error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+ error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
if (error)
goto out_qs;
@@ -1949,7 +1949,7 @@ out_rg_gunlock:
gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
out_rlist:
gfs2_rlist_free(&rlist);
- gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+ gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
out_qs:
gfs2_quota_unhold(dip);
out:
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index aa8dbf303f6..f114ba2b355 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -56,46 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
return type;
}
-static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
- int error = permission(inode, MAY_READ, NULL);
- if (error)
- return error;
-
- return gfs2_ea_get_i(ip, er);
-}
-
-static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
-
- if (S_ISREG(inode->i_mode) ||
- (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
- int error = permission(inode, MAY_WRITE, NULL);
- if (error)
- return error;
- } else
- return -EPERM;
-
- return gfs2_ea_set_i(ip, er);
-}
-
-static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
-
- if (S_ISREG(inode->i_mode) ||
- (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
- int error = permission(inode, MAY_WRITE, NULL);
- if (error)
- return error;
- } else
- return -EPERM;
-
- return gfs2_ea_remove_i(ip, er);
-}
-
static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
@@ -108,8 +68,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
return -EOPNOTSUPP;
-
-
return gfs2_ea_get_i(ip, er);
}
@@ -170,40 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
return gfs2_ea_remove_i(ip, er);
}
-static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
- int error = permission(inode, MAY_READ, NULL);
- if (error)
- return error;
-
- return gfs2_ea_get_i(ip, er);
-}
-
-static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
- int error = permission(inode, MAY_WRITE, NULL);
- if (error)
- return error;
-
- return gfs2_ea_set_i(ip, er);
-}
-
-static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct inode *inode = &ip->i_inode;
- int error = permission(inode, MAY_WRITE, NULL);
- if (error)
- return error;
-
- return gfs2_ea_remove_i(ip, er);
-}
-
static const struct gfs2_eattr_operations gfs2_user_eaops = {
- .eo_get = user_eo_get,
- .eo_set = user_eo_set,
- .eo_remove = user_eo_remove,
+ .eo_get = gfs2_ea_get_i,
+ .eo_set = gfs2_ea_set_i,
+ .eo_remove = gfs2_ea_remove_i,
.eo_name = "user",
};
@@ -215,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = {
};
static const struct gfs2_eattr_operations gfs2_security_eaops = {
- .eo_get = security_eo_get,
- .eo_set = security_eo_set,
- .eo_remove = security_eo_remove,
+ .eo_get = gfs2_ea_get_i,
+ .eo_set = gfs2_ea_set_i,
+ .eo_remove = gfs2_ea_remove_i,
.eo_name = "security",
};
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 2a7435b5c4d..bee99704ea1 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1418,7 +1418,7 @@ out:
static int ea_dealloc_block(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_rgrpd *rgd;
struct buffer_head *dibh;
int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a37efe4aae6..80e09c50590 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
- BUG_ON(spin_is_locked(&gl->gl_spin));
gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
gfs2_assert(sdp, list_empty(&gl->gl_holders));
@@ -346,7 +345,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_object = NULL;
gl->gl_sbd = sdp;
gl->gl_aspace = NULL;
- lops_init_le(&gl->gl_le, &gfs2_glock_lops);
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
/* If this glock protects actual on-disk data or metadata blocks,
@@ -461,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
static void gfs2_demote_wake(struct gfs2_glock *gl)
{
- BUG_ON(!spin_is_locked(&gl->gl_spin));
gl->gl_demote_state = LM_ST_EXCLUSIVE;
clear_bit(GLF_DEMOTE, &gl->gl_flags);
smp_mb__after_clear_bit();
@@ -507,21 +504,12 @@ static int rq_mutex(struct gfs2_holder *gh)
static int rq_promote(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
if (list_empty(&gl->gl_holders)) {
gl->gl_req_gh = gh;
set_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
-
- if (atomic_read(&sdp->sd_reclaim_count) >
- gfs2_tune_get(sdp, gt_reclaim_limit) &&
- !(gh->gh_flags & LM_FLAG_PRIORITY)) {
- gfs2_reclaim_glock(sdp);
- gfs2_reclaim_glock(sdp);
- }
-
gfs2_glock_xmote_th(gh->gh_gl, gh);
spin_lock(&gl->gl_spin);
}
@@ -567,7 +555,10 @@ static int rq_demote(struct gfs2_glock *gl)
gfs2_demote_wake(gl);
return 0;
}
+
set_bit(GLF_LOCK, &gl->gl_flags);
+ set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+
if (gl->gl_demote_state == LM_ST_UNLOCKED ||
gl->gl_state != LM_ST_EXCLUSIVE) {
spin_unlock(&gl->gl_spin);
@@ -576,7 +567,9 @@ static int rq_demote(struct gfs2_glock *gl)
spin_unlock(&gl->gl_spin);
gfs2_glock_xmote_th(gl, NULL);
}
+
spin_lock(&gl->gl_spin);
+ clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
return 0;
}
@@ -598,23 +591,18 @@ static void run_queue(struct gfs2_glock *gl)
if (!list_empty(&gl->gl_waiters1)) {
gh = list_entry(gl->gl_waiters1.next,
struct gfs2_holder, gh_list);
-
- if (test_bit(HIF_MUTEX, &gh->gh_iflags))
- blocked = rq_mutex(gh);
- else
- gfs2_assert_warn(gl->gl_sbd, 0);
-
+ blocked = rq_mutex(gh);
} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
blocked = rq_demote(gl);
+ if (gl->gl_waiters2 && !blocked) {
+ set_bit(GLF_DEMOTE, &gl->gl_flags);
+ gl->gl_demote_state = LM_ST_UNLOCKED;
+ }
+ gl->gl_waiters2 = 0;
} else if (!list_empty(&gl->gl_waiters3)) {
gh = list_entry(gl->gl_waiters3.next,
struct gfs2_holder, gh_list);
-
- if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
- blocked = rq_promote(gh);
- else
- gfs2_assert_warn(gl->gl_sbd, 0);
-
+ blocked = rq_promote(gh);
} else
break;
@@ -632,27 +620,21 @@ static void run_queue(struct gfs2_glock *gl)
static void gfs2_glmutex_lock(struct gfs2_glock *gl)
{
- struct gfs2_holder gh;
-
- gfs2_holder_init(gl, 0, 0, &gh);
- set_bit(HIF_MUTEX, &gh.gh_iflags);
- if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
- BUG();
-
spin_lock(&gl->gl_spin);
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ struct gfs2_holder gh;
+
+ gfs2_holder_init(gl, 0, 0, &gh);
+ set_bit(HIF_WAIT, &gh.gh_iflags);
list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+ spin_unlock(&gl->gl_spin);
+ wait_on_holder(&gh);
+ gfs2_holder_uninit(&gh);
} else {
gl->gl_owner_pid = current->pid;
gl->gl_ip = (unsigned long)__builtin_return_address(0);
- clear_bit(HIF_WAIT, &gh.gh_iflags);
- smp_mb();
- wake_up_bit(&gh.gh_iflags, HIF_WAIT);
+ spin_unlock(&gl->gl_spin);
}
- spin_unlock(&gl->gl_spin);
-
- wait_on_holder(&gh);
- gfs2_holder_uninit(&gh);
}
/**
@@ -691,7 +673,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
gl->gl_owner_pid = 0;
gl->gl_ip = 0;
run_queue(gl);
- BUG_ON(!spin_is_locked(&gl->gl_spin));
spin_unlock(&gl->gl_spin);
}
@@ -722,7 +703,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
}
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
- gl->gl_demote_state = LM_ST_UNLOCKED;
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ gl->gl_waiters2 = 1;
+ else
+ gl->gl_demote_state = LM_ST_UNLOCKED;
}
spin_unlock(&gl->gl_spin);
}
@@ -943,8 +927,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned int ret;
- if (glops->go_drop_th)
- glops->go_drop_th(gl);
+ if (glops->go_xmote_th)
+ glops->go_xmote_th(gl);
gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -1156,8 +1140,6 @@ restart:
return -EIO;
}
- set_bit(HIF_PROMOTE, &gh->gh_iflags);
-
spin_lock(&gl->gl_spin);
add_to_queue(gh);
run_queue(gl);
@@ -1248,12 +1230,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
list_del_init(&gh->gh_list);
if (list_empty(&gl->gl_holders)) {
- spin_unlock(&gl->gl_spin);
-
- if (glops->go_unlock)
+ if (glops->go_unlock) {
+ spin_unlock(&gl->gl_spin);
glops->go_unlock(gh);
-
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_spin);
+ }
gl->gl_stamp = jiffies;
}
@@ -1910,8 +1891,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
- print_dbg(gi, " le = %s\n",
- (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
print_dbg(gi, " reclaim = %s\n",
(list_empty(&gl->gl_reclaim)) ? "no" : "yes");
if (gl->gl_aspace)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4670dcb2a87..c663b7a0f41 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
bd = list_entry(head->next, struct gfs2_bufdata,
bd_ail_gl_list);
bh = bd->bd_bh;
- gfs2_remove_from_ail(NULL, bd);
+ gfs2_remove_from_ail(bd);
bd->bd_bh = NULL;
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
@@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
if (!ip || !S_ISREG(inode->i_mode))
return;
- if (!test_bit(GIF_PAGED, &ip->i_flags))
- return;
-
unmap_shared_mapping_range(inode->i_mapping, 0, 0);
-
if (test_bit(GIF_SW_PAGED, &ip->i_flags))
set_bit(GLF_DIRTY, &gl->gl_flags);
- clear_bit(GIF_SW_PAGED, &ip->i_flags);
}
/**
@@ -143,44 +138,34 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
static void inode_go_sync(struct gfs2_glock *gl)
{
struct gfs2_inode *ip = gl->gl_object;
+ struct address_space *metamapping = gl->gl_aspace->i_mapping;
+ int error;
+
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ gfs2_pte_inval(gl);
+ if (gl->gl_state != LM_ST_EXCLUSIVE)
+ return;
if (ip && !S_ISREG(ip->i_inode.i_mode))
ip = NULL;
if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
- if (ip && !gfs2_is_jdata(ip))
- filemap_fdatawrite(ip->i_inode.i_mapping);
gfs2_log_flush(gl->gl_sbd, gl);
- if (ip && gfs2_is_jdata(ip))
- filemap_fdatawrite(ip->i_inode.i_mapping);
- gfs2_meta_sync(gl);
+ filemap_fdatawrite(metamapping);
if (ip) {
struct address_space *mapping = ip->i_inode.i_mapping;
- int error = filemap_fdatawait(mapping);
+ filemap_fdatawrite(mapping);
+ error = filemap_fdatawait(mapping);
mapping_set_error(mapping, error);
}
+ error = filemap_fdatawait(metamapping);
+ mapping_set_error(metamapping, error);
clear_bit(GLF_DIRTY, &gl->gl_flags);
gfs2_ail_empty_gl(gl);
}
}
/**
- * inode_go_xmote_th - promote/demote a glock
- * @gl: the glock
- * @state: the requested state
- * @flags:
- *
- */
-
-static void inode_go_xmote_th(struct gfs2_glock *gl)
-{
- if (gl->gl_state != LM_ST_UNLOCKED)
- gfs2_pte_inval(gl);
- if (gl->gl_state == LM_ST_EXCLUSIVE)
- inode_go_sync(gl);
-}
-
-/**
* inode_go_xmote_bh - After promoting/demoting a glock
* @gl: the glock
*
@@ -201,22 +186,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
}
/**
- * inode_go_drop_th - unlock a glock
- * @gl: the glock
- *
- * Invoked from rq_demote().
- * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
- * is being purged from our node's glock cache; we're dropping lock.
- */
-
-static void inode_go_drop_th(struct gfs2_glock *gl)
-{
- gfs2_pte_inval(gl);
- if (gl->gl_state == LM_ST_EXCLUSIVE)
- inode_go_sync(gl);
-}
-
-/**
* inode_go_inval - prepare a inode glock to be released
* @gl: the glock
* @flags:
@@ -234,10 +203,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
set_bit(GIF_INVALID, &ip->i_flags);
}
- if (ip && S_ISREG(ip->i_inode.i_mode)) {
+ if (ip && S_ISREG(ip->i_inode.i_mode))
truncate_inode_pages(ip->i_inode.i_mapping, 0);
- clear_bit(GIF_PAGED, &ip->i_flags);
- }
}
/**
@@ -294,23 +261,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
}
/**
- * inode_go_unlock - operation done before an inode lock is unlocked by a
- * process
- * @gl: the glock
- * @flags:
- *
- */
-
-static void inode_go_unlock(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_inode *ip = gl->gl_object;
-
- if (ip)
- gfs2_meta_cache_flush(ip);
-}
-
-/**
* rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
* @gl: the glock
*
@@ -350,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
}
/**
- * trans_go_xmote_th - promote/demote the transaction glock
+ * trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
* @flags:
*
*/
-static void trans_go_xmote_th(struct gfs2_glock *gl)
+static void trans_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -384,7 +334,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
if (gl->gl_state != LM_ST_UNLOCKED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -402,24 +351,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
}
/**
- * trans_go_drop_th - unlock the transaction glock
- * @gl: the glock
- *
- * We want to sync the device even with localcaching. Remember
- * that localcaching journal replay only marks buffers dirty.
- */
-
-static void trans_go_drop_th(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- gfs2_meta_syncfs(sdp);
- gfs2_log_shutdown(sdp);
- }
-}
-
-/**
* quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
* @gl: the glock
*
@@ -433,25 +364,21 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
const struct gfs2_glock_operations gfs2_meta_glops = {
.go_xmote_th = meta_go_sync,
- .go_drop_th = meta_go_sync,
.go_type = LM_TYPE_META,
};
const struct gfs2_glock_operations gfs2_inode_glops = {
- .go_xmote_th = inode_go_xmote_th,
+ .go_xmote_th = inode_go_sync,
.go_xmote_bh = inode_go_xmote_bh,
- .go_drop_th = inode_go_drop_th,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_lock = inode_go_lock,
- .go_unlock = inode_go_unlock,
.go_type = LM_TYPE_INODE,
.go_min_hold_time = HZ / 10,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_xmote_th = meta_go_sync,
- .go_drop_th = meta_go_sync,
.go_inval = meta_go_inval,
.go_demote_ok = rgrp_go_demote_ok,
.go_lock = rgrp_go_lock,
@@ -461,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
};
const struct gfs2_glock_operations gfs2_trans_glops = {
- .go_xmote_th = trans_go_xmote_th,
+ .go_xmote_th = trans_go_sync,
.go_xmote_bh = trans_go_xmote_bh,
- .go_drop_th = trans_go_drop_th,
.go_type = LM_TYPE_NONDISK,
};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eaddfb5a8e6..513aaf0dc0a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -131,7 +131,6 @@ struct gfs2_bufdata {
struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
void (*go_xmote_bh) (struct gfs2_glock *gl);
- void (*go_drop_th) (struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
@@ -141,10 +140,6 @@ struct gfs2_glock_operations {
};
enum {
- /* Actions */
- HIF_MUTEX = 0,
- HIF_PROMOTE = 1,
-
/* States */
HIF_HOLDER = 6,
HIF_FIRST = 7,
@@ -171,6 +166,8 @@ enum {
GLF_DEMOTE = 3,
GLF_PENDING_DEMOTE = 4,
GLF_DIRTY = 5,
+ GLF_DEMOTE_IN_PROGRESS = 6,
+ GLF_LFLUSH = 7,
};
struct gfs2_glock {
@@ -190,6 +187,7 @@ struct gfs2_glock {
struct list_head gl_holders;
struct list_head gl_waiters1; /* HIF_MUTEX */
struct list_head gl_waiters3; /* HIF_PROMOTE */
+ int gl_waiters2; /* GIF_DEMOTE */
const struct gfs2_glock_operations *gl_ops;
@@ -210,7 +208,6 @@ struct gfs2_glock {
struct gfs2_sbd *gl_sbd;
struct inode *gl_aspace;
- struct gfs2_log_element gl_le;
struct list_head gl_ail_list;
atomic_t gl_ail_count;
struct delayed_work gl_work;
@@ -239,7 +236,6 @@ struct gfs2_alloc {
enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
- GIF_PAGED = 2,
GIF_SW_PAGED = 3,
};
@@ -268,14 +264,10 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_alloc i_alloc;
+ struct gfs2_alloc *i_alloc;
u64 i_last_rg_alloc;
- spinlock_t i_spin;
struct rw_semaphore i_rw_mutex;
- unsigned long i_last_pfault;
-
- struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
};
/*
@@ -287,19 +279,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
return container_of(inode, struct gfs2_inode, i_inode);
}
-/* To be removed? */
-static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
+static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
{
return inode->i_sb->s_fs_info;
}
-enum {
- GFF_DID_DIRECT_ALLOC = 0,
- GFF_EXLOCK = 1,
-};
-
struct gfs2_file {
- unsigned long f_flags; /* GFF_... */
struct mutex f_fl_mutex;
struct gfs2_holder f_fl_gh;
};
@@ -373,8 +358,17 @@ struct gfs2_ail {
u64 ai_sync_gen;
};
+struct gfs2_journal_extent {
+ struct list_head extent_list;
+
+ unsigned int lblock; /* First logical block */
+ u64 dblock; /* First disk block */
+ u64 blocks;
+};
+
struct gfs2_jdesc {
struct list_head jd_list;
+ struct list_head extent_list;
struct inode *jd_inode;
unsigned int jd_jid;
@@ -421,13 +415,9 @@ struct gfs2_args {
struct gfs2_tune {
spinlock_t gt_spin;
- unsigned int gt_ilimit;
- unsigned int gt_ilimit_tries;
- unsigned int gt_ilimit_min;
unsigned int gt_demote_secs; /* Cache retention for unheld glock */
unsigned int gt_incore_log_blocks;
unsigned int gt_log_flush_secs;
- unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
unsigned int gt_recoverd_secs;
unsigned int gt_logd_secs;
@@ -443,10 +433,8 @@ struct gfs2_tune {
unsigned int gt_new_files_jdata;
unsigned int gt_new_files_directio;
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
- unsigned int gt_lockdump_size;
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
- unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
unsigned int gt_statfs_quantum;
unsigned int gt_statfs_slow;
};
@@ -539,7 +527,6 @@ struct gfs2_sbd {
/* StatFS stuff */
spinlock_t sd_statfs_spin;
- struct mutex sd_statfs_mutex;
struct gfs2_statfs_change_host sd_statfs_master;
struct gfs2_statfs_change_host sd_statfs_local;
unsigned long sd_statfs_sync_time;
@@ -602,20 +589,18 @@ struct gfs2_sbd {
unsigned int sd_log_commited_databuf;
unsigned int sd_log_commited_revoke;
- unsigned int sd_log_num_gl;
unsigned int sd_log_num_buf;
unsigned int sd_log_num_revoke;
unsigned int sd_log_num_rg;
unsigned int sd_log_num_databuf;
- struct list_head sd_log_le_gl;
struct list_head sd_log_le_buf;
struct list_head sd_log_le_revoke;
struct list_head sd_log_le_rg;
struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
- unsigned int sd_log_blks_free;
+ atomic_t sd_log_blks_free;
struct mutex sd_log_reserve_mutex;
u64 sd_log_sequence;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f6dc32946c..728d3169e7b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -31,7 +31,6 @@
#include "log.h"
#include "meta_io.h"
#include "ops_address.h"
-#include "ops_file.h"
#include "ops_inode.h"
#include "quota.h"
#include "rgrp.h"
@@ -132,15 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
void gfs2_set_iop(struct inode *inode)
{
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
umode_t mode = inode->i_mode;
if (S_ISREG(mode)) {
inode->i_op = &gfs2_file_iops;
- inode->i_fop = &gfs2_file_fops;
- inode->i_mapping->a_ops = &gfs2_file_aops;
+ if (sdp->sd_args.ar_localflocks)
+ inode->i_fop = &gfs2_file_fops_nolock;
+ else
+ inode->i_fop = &gfs2_file_fops;
} else if (S_ISDIR(mode)) {
inode->i_op = &gfs2_dir_iops;
- inode->i_fop = &gfs2_dir_fops;
+ if (sdp->sd_args.ar_localflocks)
+ inode->i_fop = &gfs2_dir_fops_nolock;
+ else
+ inode->i_fop = &gfs2_dir_fops;
} else if (S_ISLNK(mode)) {
inode->i_op = &gfs2_symlink_iops;
} else {
@@ -291,12 +296,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
di->di_entries = be32_to_cpu(str->di_entries);
di->di_eattr = be64_to_cpu(str->di_eattr);
- return 0;
-}
+ if (S_ISREG(ip->i_inode.i_mode))
+ gfs2_set_aops(&ip->i_inode);
-static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
-{
- ip->i_cache[0] = bh;
+ return 0;
}
/**
@@ -366,7 +369,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
if (error)
goto out_rg_gunlock;
- gfs2_trans_add_gl(ip->i_gl);
+ set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+ set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
gfs2_free_di(rgd, ip);
@@ -707,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
int error;
- gfs2_alloc_get(dip);
+ if (gfs2_alloc_get(dip) == NULL)
+ return -ENOMEM;
- dip->i_alloc.al_requested = RES_DINODE;
+ dip->i_alloc->al_requested = RES_DINODE;
error = gfs2_inplace_reserve(dip);
if (error)
goto out;
@@ -855,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
if (alloc_required < 0)
- goto fail;
+ goto fail_quota_locks;
if (alloc_required) {
error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
if (error)
@@ -896,7 +901,7 @@ fail_end_trans:
gfs2_trans_end(sdp);
fail_ipreserv:
- if (dip->i_alloc.al_rgd)
+ if (dip->i_alloc->al_rgd)
gfs2_inplace_release(dip);
fail_quota_locks:
@@ -966,7 +971,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
int error;
u64 generation;
- struct buffer_head *bh=NULL;
+ struct buffer_head *bh = NULL;
if (!name->len || name->len > GFS2_FNAMESIZE)
return ERR_PTR(-ENAMETOOLONG);
@@ -1003,8 +1008,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (IS_ERR(inode))
goto fail_gunlock2;
- gfs2_inode_bh(GFS2_I(inode), bh);
-
error = gfs2_inode_refresh(GFS2_I(inode));
if (error)
goto fail_gunlock2;
@@ -1021,6 +1024,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
+ if (bh)
+ brelse(bh);
if (!inode)
return ERR_PTR(-ENOMEM);
return inode;
@@ -1032,6 +1037,8 @@ fail_gunlock2:
fail_gunlock:
gfs2_glock_dq(ghs);
fail:
+ if (bh)
+ brelse(bh);
return ERR_PTR(error);
}
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 351ac87ab38..d4465066261 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,6 +20,18 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
return ip->i_di.di_flags & GFS2_DIF_JDATA;
}
+static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+{
+ const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+}
+
+static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+{
+ const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+}
+
static inline int gfs2_is_dir(const struct gfs2_inode *ip)
{
return S_ISDIR(ip->i_inode.i_mode);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 41c5b04caab..f2efff42422 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -67,6 +67,11 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
memset(data, 0, 256);
strncpy(data, data_arg, 255);
+ if (!strlen(data)) {
+ log_error("no mount options, (u)mount helpers not installed");
+ return -EINVAL;
+ }
+
for (options = data; (x = strsep(&options, ":")); ) {
if (!*x)
continue;
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1f7b038530b..2ebd374b314 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
op->info.number = name->ln_number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long) fl->fl_owner;
if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+ /* fl_owner is lockd which doesn't distinguish
+ processes on the nfs client */
+ op->info.owner = (__u64) fl->fl_pid;
xop->callback = fl->fl_lmops->fl_grant;
locks_init_lock(&xop->flc);
locks_copy_lock(&xop->flc, fl);
xop->fl = fl;
xop->file = file;
- } else
+ } else {
+ op->info.owner = (__u64)(long) fl->fl_owner;
xop->callback = NULL;
+ }
send_op(op);
@@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
op->info.number = name->ln_number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long) fl->fl_owner;
+ if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+ op->info.owner = (__u64) fl->fl_pid;
+ else
+ op->info.owner = (__u64)(long) fl->fl_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
@@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
op->info.number = name->ln_number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- op->info.owner = (__u64)(long) fl->fl_owner;
+ if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+ op->info.owner = (__u64) fl->fl_pid;
+ else
+ op->info.owner = (__u64)(long) fl->fl_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index ae9e6a25fe2..a87b0983976 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,51 +189,39 @@ static struct kobj_type gdlm_ktype = {
.sysfs_ops = &gdlm_attr_ops,
};
-static struct kset gdlm_kset = {
- .ktype = &gdlm_ktype,
-};
+static struct kset *gdlm_kset;
int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
{
int error;
- error = kobject_set_name(&ls->kobj, "%s", "lock_module");
- if (error) {
- log_error("can't set kobj name %d", error);
- return error;
- }
-
- ls->kobj.kset = &gdlm_kset;
- ls->kobj.ktype = &gdlm_ktype;
- ls->kobj.parent = fskobj;
-
- error = kobject_register(&ls->kobj);
+ ls->kobj.kset = gdlm_kset;
+ error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
+ "lock_module");
if (error)
log_error("can't register kobj %d", error);
+ kobject_uevent(&ls->kobj, KOBJ_ADD);
return error;
}
void gdlm_kobject_release(struct gdlm_ls *ls)
{
- kobject_unregister(&ls->kobj);
+ kobject_put(&ls->kobj);
}
int gdlm_sysfs_init(void)
{
- int error;
-
- kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
- kobj_set_kset_s(&gdlm_kset, kernel_subsys);
- error = kset_register(&gdlm_kset);
- if (error)
- printk("lock_dlm: cannot register kset %d\n", error);
-
- return error;
+ gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+ if (!gdlm_kset) {
+ printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
+ return -ENOMEM;
+ }
+ return 0;
}
void gdlm_sysfs_exit(void)
{
- kset_unregister(&gdlm_kset);
+ kset_unregister(gdlm_kset);
}
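The conversion above follows the dynamic kset/kobject API: one kset is created at module init, each lockspace kobject is initialised and added under the filesystem kobject, and teardown goes through kobject_put()/kset_unregister(). A compressed sketch of that lifecycle, using only the calls already present in the hunk:

static struct kset *example_kset;

static int example_sysfs_init(void)
{
	/* creates /sys/kernel/lock_dlm, parented on kernel_kobj */
	example_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
	return example_kset ? 0 : -ENOMEM;
}

static int example_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
{
	int error;

	ls->kobj.kset = example_kset;	/* inherit the kset before adding */
	error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
				     "lock_module");
	if (!error)
		kobject_uevent(&ls->kobj, KOBJ_ADD);
	return error;
}

static void example_sysfs_teardown(struct gdlm_ls *ls)
{
	kobject_put(&ls->kobj);		/* drops the init reference */
	kset_unregister(example_kset);
}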
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index bd938f06481..521694fc19d 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist)
struct gdlm_ls *ls = (struct gdlm_ls *) data;
struct gdlm_lock *lp = NULL;
uint8_t complete, blocking, submit, drop;
- DECLARE_WAITQUEUE(wait, current);
/* Only thread1 is allowed to do blocking callbacks since gfs
may wait for a completion callback within a blocking cb. */
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&ls->thread_wait, &wait);
- if (no_work(ls, blist))
- schedule();
- remove_wait_queue(&ls->thread_wait, &wait);
- set_current_state(TASK_RUNNING);
+ wait_event_interruptible(ls->thread_wait,
+ !no_work(ls, blist) || kthread_should_stop());
complete = blocking = submit = drop = 0;
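wait_event_interruptible() replaces the open-coded waitqueue/schedule sequence; the condition is re-evaluated inside the macro, and folding in kthread_should_stop() lets the thread wake promptly when it is asked to exit. A minimal kthread loop of the same shape (illustrative only; the second no_work() argument is fixed at 0 here for brevity):

#include <linux/kthread.h>
#include <linux/wait.h>

/* Illustrative worker loop: sleep until there is work queued or we are
 * told to stop, mirroring the pattern gdlm_thread() now uses. */
static int example_thread(void *data)
{
	struct gdlm_ls *ls = data;

	while (!kthread_should_stop()) {
		wait_event_interruptible(ls->thread_wait,
				!no_work(ls, 0) || kthread_should_stop());
		/* ... process queued lock requests here ... */
	}
	return 0;
}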
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7df70247325..161ab6f2058 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -16,6 +16,8 @@
#include <linux/crc32.h>
#include <linux/lm_interface.h>
#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "gfs2.h"
#include "incore.h"
@@ -68,14 +70,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
*
*/
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
bd->bd_ail = NULL;
list_del_init(&bd->bd_ail_st_list);
list_del_init(&bd->bd_ail_gl_list);
atomic_dec(&bd->bd_gl->gl_ail_count);
- if (mapping)
- gfs2_meta_cache_flush(GFS2_I(mapping->host));
brelse(bd->bd_bh);
}
@@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
struct buffer_head *bh;
int retry;
- BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
-
do {
retry = 0;
@@ -210,7 +208,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
gfs2_log_unlock(sdp);
}
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
{
struct gfs2_ail *ai, *s;
int ret;
@@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
bd = list_entry(head->prev, struct gfs2_bufdata,
bd_ail_st_list);
gfs2_assert(sdp, bd->bd_ail == ai);
- gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
+ gfs2_remove_from_ail(bd);
}
}
@@ -303,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
mutex_lock(&sdp->sd_log_reserve_mutex);
gfs2_log_lock(sdp);
- while(sdp->sd_log_blks_free <= (blks + reserved_blks)) {
+ while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
gfs2_log_unlock(sdp);
gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL);
@@ -312,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
gfs2_ail1_start(sdp, 0);
gfs2_log_lock(sdp);
}
- sdp->sd_log_blks_free -= blks;
+ atomic_sub(blks, &sdp->sd_log_blks_free);
gfs2_log_unlock(sdp);
mutex_unlock(&sdp->sd_log_reserve_mutex);
@@ -332,27 +330,23 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
{
gfs2_log_lock(sdp);
- sdp->sd_log_blks_free += blks;
+ atomic_add(blks, &sdp->sd_log_blks_free);
gfs2_assert_withdraw(sdp,
- sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+ atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
gfs2_log_unlock(sdp);
up_read(&sdp->sd_log_flush_lock);
}
static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
{
- struct inode *inode = sdp->sd_jdesc->jd_inode;
- int error;
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
- bh_map.b_size = 1 << inode->i_blkbits;
- error = gfs2_block_map(inode, lbn, 0, &bh_map);
- if (error || !bh_map.b_blocknr)
- printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error,
- (unsigned long long)bh_map.b_blocknr, lbn);
- gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
-
- return bh_map.b_blocknr;
+ struct gfs2_journal_extent *je;
+
+ list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
+ if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
+ return je->dblock + lbn - je->lblock;
+ }
+
+ return -1;
}
/**
@@ -561,8 +555,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
ail2_empty(sdp, new_tail);
gfs2_log_lock(sdp);
- sdp->sd_log_blks_free += dist;
- gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+ atomic_add(dist, &sdp->sd_log_blks_free);
+ gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
gfs2_log_unlock(sdp);
sdp->sd_log_tail = new_tail;
@@ -652,7 +646,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
get_bh(bh);
gfs2_log_unlock(sdp);
lock_buffer(bh);
- if (test_clear_buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
} else {
@@ -694,20 +688,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
*
*/
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
{
struct gfs2_ail *ai;
down_write(&sdp->sd_log_flush_lock);
- if (gl) {
- gfs2_log_lock(sdp);
- if (list_empty(&gl->gl_le.le_list)) {
- gfs2_log_unlock(sdp);
- up_write(&sdp->sd_log_flush_lock);
- return;
- }
- gfs2_log_unlock(sdp);
+ /* Log might have been flushed while we waited for the flush lock */
+ if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
+ up_write(&sdp->sd_log_flush_lock);
+ return;
}
ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
@@ -739,7 +729,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
log_flush_commit(sdp);
else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
gfs2_log_lock(sdp);
- sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */
+ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
gfs2_log_unlock(sdp);
log_write_header(sdp, 0, PULL);
}
@@ -767,7 +757,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
unsigned int reserved;
- unsigned int old;
+ unsigned int unused;
gfs2_log_lock(sdp);
@@ -779,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
reserved = calc_reserved(sdp);
- old = sdp->sd_log_blks_free;
- sdp->sd_log_blks_free += tr->tr_reserved -
- (reserved - sdp->sd_log_blks_reserved);
-
- gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
- gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
+ unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
+ gfs2_assert_withdraw(sdp, unused >= 0);
+ atomic_add(unused, &sdp->sd_log_blks_free);
+ gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
-
sdp->sd_log_blks_reserved = reserved;
gfs2_log_unlock(sdp);
@@ -825,7 +812,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
down_write(&sdp->sd_log_flush_lock);
gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
@@ -838,7 +824,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
(sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
- gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+ gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
@@ -866,3 +852,42 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
}
}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+ int need_flush;
+
+ while (!kthread_should_stop()) {
+ /* Advance the log tail */
+
+ t = sdp->sd_log_flush_time +
+ gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+ gfs2_ail1_empty(sdp, DIO_ALL);
+ gfs2_log_lock(sdp);
+ need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
+ gfs2_log_unlock(sdp);
+ if (need_flush || time_after_eq(jiffies, t)) {
+ gfs2_log_flush(sdp, NULL);
+ sdp->sd_log_flush_time = jiffies;
+ }
+
+ t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+ if (freezing(current))
+ refrigerator();
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
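log_bmap() above now resolves logical journal blocks through a precomputed extent list instead of calling gfs2_block_map() per block; that list is built once at mount time by map_journal_extents(), added later in this patch in ops_fstype.c. A small self-contained model of the same lookup, with a hypothetical extent type standing in for struct gfs2_journal_extent:

#include <stdint.h>

/* Hypothetical stand-in for struct gfs2_journal_extent: one run of
 * physically contiguous journal blocks. */
struct jextent {
	uint64_t lblock;	/* first logical block of the run */
	uint64_t dblock;	/* corresponding first disk block */
	uint64_t blocks;	/* length of the run */
};

/* Same arithmetic as log_bmap(): find the run covering lbn and index
 * into it; (uint64_t)-1 signals "not mapped", like the -1 above. */
static uint64_t extent_bmap(const struct jextent *ext, int n, uint64_t lbn)
{
	int i;

	for (i = 0; i < n; i++)
		if (lbn >= ext[i].lblock && lbn < ext[i].lblock + ext[i].blocks)
			return ext[i].dblock + (lbn - ext[i].lblock);
	return (uint64_t)-1;
}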
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index dae28240062..77115281650 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
-
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
void gfs2_log_incr_head(struct gfs2_sbd *sdp);
@@ -57,11 +55,19 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp);
struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
struct buffer_head *real);
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+
+static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
+{
+ if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
+ __gfs2_log_flush(sbd, gl);
+}
+
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
void gfs2_log_shutdown(struct gfs2_sbd *sdp);
void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c27cea761c..fae59d69d01 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
}
bd->bd_ail = ai;
list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
}
@@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
return bh;
}
-static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
- struct gfs2_glock *gl;
- struct gfs2_trans *tr = current->journal_info;
-
- tr->tr_touched = 1;
-
- gl = container_of(le, struct gfs2_glock, gl_le);
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
- return;
-
- if (!list_empty(&le->le_list))
- return;
-
- gfs2_glock_hold(gl);
- set_bit(GLF_DIRTY, &gl->gl_flags);
- sdp->sd_log_num_gl++;
- list_add(&le->le_list, &sdp->sd_log_le_gl);
-}
-
-static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
- gfs2_log_lock(sdp);
- __glock_lo_add(sdp, le);
- gfs2_log_unlock(sdp);
-}
-
-static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
- struct list_head *head = &sdp->sd_log_le_gl;
- struct gfs2_glock *gl;
-
- while (!list_empty(head)) {
- gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
- list_del_init(&gl->gl_le.le_list);
- sdp->sd_log_num_gl--;
-
- gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
- gfs2_glock_put(gl);
- }
- gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
-}
-
static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
@@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
list_add(&bd->bd_list_tr, &tr->tr_list_buf);
if (!list_empty(&le->le_list))
goto out;
- __glock_lo_add(sdp, &bd->bd_gl->gl_le);
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
gfs2_meta_check(sdp, bd->bd_bh);
gfs2_pin(sdp, bd->bd_bh);
sdp->sd_log_num_buf++;
@@ -556,17 +515,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
lock_buffer(bd->bd_bh);
gfs2_log_lock(sdp);
- if (!list_empty(&bd->bd_list_tr))
- goto out;
- tr->tr_touched = 1;
- if (gfs2_is_jdata(ip)) {
- tr->tr_num_buf++;
- list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+ if (tr) {
+ if (!list_empty(&bd->bd_list_tr))
+ goto out;
+ tr->tr_touched = 1;
+ if (gfs2_is_jdata(ip)) {
+ tr->tr_num_buf++;
+ list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+ }
}
if (!list_empty(&le->le_list))
goto out;
- __glock_lo_add(sdp, &bd->bd_gl->gl_le);
+ set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+ set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
if (gfs2_is_jdata(ip)) {
gfs2_pin(sdp, bd->bd_bh);
tr->tr_num_databuf_new++;
@@ -773,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
}
-const struct gfs2_log_operations gfs2_glock_lops = {
- .lo_add = glock_lo_add,
- .lo_after_commit = glock_lo_after_commit,
- .lo_name = "glock",
-};
-
const struct gfs2_log_operations gfs2_buf_lops = {
.lo_add = buf_lo_add,
.lo_incore_commit = buf_lo_incore_commit,
@@ -816,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
};
const struct gfs2_log_operations *gfs2_log_ops[] = {
- &gfs2_glock_lops,
&gfs2_databuf_lops,
&gfs2_buf_lops,
&gfs2_rg_lops,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7ecfe0d3a49..9c7765c12d6 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -29,9 +29,8 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
struct gfs2_inode *ip = foo;
inode_init_once(&ip->i_inode);
- spin_lock_init(&ip->i_spin);
init_rwsem(&ip->i_rw_mutex);
- memset(ip->i_cache, 0, sizeof(ip->i_cache));
+ ip->i_alloc = NULL;
}
static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4da423985e4..85aea27b4a8 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page,
static const struct address_space_operations aspace_aops = {
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
+ .sync_page = block_sync_page,
};
/**
@@ -221,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
*bhp = getbuf(gl, blkno, CREATE);
- if (!buffer_uptodate(*bhp))
+ if (!buffer_uptodate(*bhp)) {
ll_rw_block(READ_META, 1, bhp);
- if (flags & DIO_WAIT) {
- int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
- if (error) {
- brelse(*bhp);
- return error;
+ if (flags & DIO_WAIT) {
+ int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+ if (error) {
+ brelse(*bhp);
+ return error;
+ }
}
}
@@ -282,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
return;
}
- bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
+ bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
bd->bd_bh = bh;
bd->bd_gl = gl;
@@ -317,7 +319,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
}
if (bd) {
if (bd->bd_ail) {
- gfs2_remove_from_ail(NULL, bd);
+ gfs2_remove_from_ail(bd);
bh->b_private = NULL;
bd->bd_bh = NULL;
bd->bd_blkno = bh->b_blocknr;
@@ -358,32 +360,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
}
/**
- * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
- * @ip: The GFS2 inode
- *
- * This releases buffers that are in the most-recently-used array of
- * blocks used for indirect block addressing for this inode.
- */
-
-void gfs2_meta_cache_flush(struct gfs2_inode *ip)
-{
- struct buffer_head **bh_slot;
- unsigned int x;
-
- spin_lock(&ip->i_spin);
-
- for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
- bh_slot = &ip->i_cache[x];
- if (*bh_slot) {
- brelse(*bh_slot);
- *bh_slot = NULL;
- }
- }
-
- spin_unlock(&ip->i_spin);
-}
-
-/**
* gfs2_meta_indirect_buffer - Get a metadata buffer
* @ip: The GFS2 inode
* @height: The level of this buf in the metadata (indir addr) tree (if any)
@@ -391,8 +367,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
* @new: Non-zero if we may create a new buffer
* @bhp: the buffer is returned here
*
- * Try to use the gfs2_inode's MRU metadata tree cache.
- *
* Returns: errno
*/
@@ -401,58 +375,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_glock *gl = ip->i_gl;
- struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
- int in_cache = 0;
-
- BUG_ON(!gl);
- BUG_ON(!sdp);
-
- spin_lock(&ip->i_spin);
- if (*bh_slot && (*bh_slot)->b_blocknr == num) {
- bh = *bh_slot;
- get_bh(bh);
- in_cache = 1;
- }
- spin_unlock(&ip->i_spin);
-
- if (!bh)
- bh = getbuf(gl, num, CREATE);
-
- if (!bh)
- return -ENOBUFS;
+ struct buffer_head *bh;
+ int ret = 0;
if (new) {
- if (gfs2_assert_warn(sdp, height))
- goto err;
- meta_prep_new(bh);
+ BUG_ON(height == 0);
+ bh = gfs2_meta_new(gl, num);
gfs2_trans_add_bh(ip->i_gl, bh, 1);
gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
} else {
u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
- if (!buffer_uptodate(bh)) {
- ll_rw_block(READ_META, 1, &bh);
- if (gfs2_meta_wait(sdp, bh))
- goto err;
+ ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+ if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
+ brelse(bh);
+ ret = -EIO;
}
- if (gfs2_metatype_check(sdp, bh, mtype))
- goto err;
- }
-
- if (!in_cache) {
- spin_lock(&ip->i_spin);
- if (*bh_slot)
- brelse(*bh_slot);
- *bh_slot = bh;
- get_bh(bh);
- spin_unlock(&ip->i_spin);
}
-
*bhp = bh;
- return 0;
-err:
- brelse(bh);
- return -EIO;
+ return ret;
}
/**
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b7048222ebb..73e3b1c76fe 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_meta_cache_flush(struct gfs2_inode *ip);
int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
int new, struct buffer_head **bhp);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9679f8b9870..38dbe99a30e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,6 +20,8 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
#include "gfs2.h"
#include "incore.h"
@@ -32,7 +34,6 @@
#include "quota.h"
#include "trans.h"
#include "rgrp.h"
-#include "ops_file.h"
#include "super.h"
#include "util.h"
#include "glops.h"
@@ -58,22 +59,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
}
/**
- * gfs2_get_block - Fills in a buffer head with details about a block
- * @inode: The inode
- * @lblock: The block number to look up
- * @bh_result: The buffer head to return the result in
- * @create: Non-zero if we may add block to the file
- *
- * Returns: errno
- */
-
-int gfs2_get_block(struct inode *inode, sector_t lblock,
- struct buffer_head *bh_result, int create)
-{
- return gfs2_block_map(inode, lblock, create, bh_result);
-}
-
-/**
* gfs2_get_block_noalloc - Fills in a buffer head with details about a block
* @inode: The inode
* @lblock: The block number to look up
@@ -88,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
{
int error;
- error = gfs2_block_map(inode, lblock, 0, bh_result);
+ error = gfs2_block_map(inode, lblock, bh_result, 0);
if (error)
return error;
if (!buffer_mapped(bh_result))
@@ -99,20 +84,19 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
struct buffer_head *bh_result, int create)
{
- return gfs2_block_map(inode, lblock, 0, bh_result);
+ return gfs2_block_map(inode, lblock, bh_result, 0);
}
/**
- * gfs2_writepage - Write complete page
- * @page: Page to write
+ * gfs2_writepage_common - Common bits of writepage
+ * @page: The page to be written
+ * @wbc: The writeback control
*
- * Returns: errno
- *
- * Some of this is copied from block_write_full_page() although we still
- * call it to do most of the work.
+ * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
*/
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int gfs2_writepage_common(struct page *page,
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
@@ -120,41 +104,133 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
- int error;
- int done_trans = 0;
+ int ret = -EIO;
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
- unlock_page(page);
- return -EIO;
- }
+ if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+ goto out;
+ ret = 0;
if (current->journal_info)
- goto out_ignore;
-
+ goto redirty;
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_CACHE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
page->mapping->a_ops->invalidatepage(page, 0);
- unlock_page(page);
- return 0; /* don't care */
+ goto out;
+ }
+ return 1;
+redirty:
+ redirty_page_for_writepage(wbc, page);
+out:
+ unlock_page(page);
+ return 0;
+}
+
+/**
+ * gfs2_writeback_writepage - Write page for writeback mappings
+ * @page: The page
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_writeback_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret;
+
+ ret = gfs2_writepage_common(page, wbc);
+ if (ret <= 0)
+ return ret;
+
+ ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
+ if (ret == -EAGAIN)
+ ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+ return ret;
+}
+
+/**
+ * gfs2_ordered_writepage - Write page for ordered data files
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_ordered_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int ret;
+
+ ret = gfs2_writepage_common(page, wbc);
+ if (ret <= 0)
+ return ret;
+
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
}
+ gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
+ return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
- if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) &&
- PageChecked(page)) {
+/**
+ * __gfs2_jdata_writepage - The core of jdata writepage
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is shared between writepage and writepages and implements the
+ * core of the writepage operation. If a transaction is required then
+ * PageChecked will have been set and the transaction will have
+ * already been started before this is called.
+ */
+
+static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+ if (PageChecked(page)) {
ClearPageChecked(page);
- error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
- if (error)
- goto out_ignore;
if (!page_has_buffers(page)) {
create_empty_buffers(page, inode->i_sb->s_blocksize,
(1 << BH_Dirty)|(1 << BH_Uptodate));
}
gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+ }
+ return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * gfs2_jdata_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ */
+
+static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ int error;
+ int done_trans = 0;
+
+ error = gfs2_writepage_common(page, wbc);
+ if (error <= 0)
+ return error;
+
+ if (PageChecked(page)) {
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ goto out_ignore;
+ error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+ if (error)
+ goto out_ignore;
done_trans = 1;
}
- error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+ error = __gfs2_jdata_writepage(page, wbc);
if (done_trans)
gfs2_trans_end(sdp);
- gfs2_meta_cache_flush(ip);
return error;
out_ignore:
@@ -164,29 +240,190 @@ out_ignore:
}
/**
- * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
* @mapping: The mapping to write
* @wbc: Write-back control
*
- * For journaled files and/or ordered writes this just falls back to the
- * kernel's default writepages path for now. We will probably want to change
- * that eventually (i.e. when we look at allocate on flush).
- *
- * For the data=writeback case though we can already ignore buffer heads
+ * For the data=writeback case we can already ignore buffer heads
* and write whole extents at once. This is a big reduction in the
* number of I/O requests we send and the bmap calls we make in this case.
*/
-static int gfs2_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int gfs2_writeback_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+}
+
+/**
+ * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
+ * @mapping: The mapping
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call for each page
+ * @pvec: The vector of pages
+ * @nr_pages: The number of pages to write
+ *
+ * Returns: non-zero if loop should terminate, zero otherwise
+ */
+
+static int gfs2_write_jdata_pagevec(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct pagevec *pvec,
+ int nr_pages, pgoff_t end)
{
struct inode *inode = mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
+ unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int i;
+ int ret;
+
+ ret = gfs2_trans_begin(sdp, nrblocks, 0);
+ if (ret < 0)
+ return ret;
+
+ for(i = 0; i < nr_pages; i++) {
+ struct page *page = pvec->pages[i];
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!wbc->range_cyclic && page->index > end) {
+ ret = 1;
+ unlock_page(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+ !clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ if (page->index > end_index || (page->index == end_index && !offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ continue;
+ }
+
+ ret = __gfs2_jdata_writepage(page, wbc);
+
+ if (ret || (--(wbc->nr_to_write) <= 0))
+ ret = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ ret = 1;
+ }
+
+ }
+ gfs2_trans_end(sdp);
+ return ret;
+}
+
+/**
+ * gfs2_write_cache_jdata - Like write_cache_pages but different
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * @writepage: The writepage function to call
+ * @data: The data to pass to writepage
+ *
+ * The reason that we use our own function here is that we need to
+ * start transactions before we grab page locks. This allows us
+ * to get the ordering right.
+ */
+
+static int gfs2_write_cache_jdata(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end;
+ int scanned = 0;
+ int range_whole = 0;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ return 0;
+ }
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ scanned = 1;
+ }
- if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
- return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+retry:
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ scanned = 1;
+ ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+ if (ret)
+ done = 1;
+ if (ret > 0)
+ ret = 0;
+
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (!scanned && !done) {
+ /*
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ scanned = 1;
+ index = 0;
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+ return ret;
+}
+
+
+/**
+ * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ *
+ */
- return generic_writepages(mapping, wbc);
+static int gfs2_jdata_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+ int ret;
+
+ ret = gfs2_write_cache_jdata(mapping, wbc);
+ if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
+ gfs2_log_flush(sdp, ip->i_gl);
+ ret = gfs2_write_cache_jdata(mapping, wbc);
+ }
+ return ret;
}
/**
@@ -231,62 +468,107 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
/**
- * gfs2_readpage - readpage with locking
- * @file: The file to read a page for. N.B. This may be NULL if we are
- * reading an internal file.
+ * __gfs2_readpage - readpage
+ * @file: The file to read a page for
* @page: The page to read
*
- * Returns: errno
+ * This is the core of gfs2's readpage. It's used by the internal file
+ * reading code as in that case we already hold the glock. Also it's
+ * called by gfs2_readpage() once the required lock has been granted.
+ *
*/
-static int gfs2_readpage(struct file *file, struct page *page)
+static int __gfs2_readpage(void *file, struct page *page)
{
struct gfs2_inode *ip = GFS2_I(page->mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
- struct gfs2_file *gf = NULL;
- struct gfs2_holder gh;
int error;
- int do_unlock = 0;
-
- if (likely(file != &gfs2_internal_file_sentinel)) {
- if (file) {
- gf = file->private_data;
- if (test_bit(GFF_EXLOCK, &gf->f_flags))
- /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
- goto skip_lock;
- }
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
- do_unlock = 1;
- error = gfs2_glock_nq_atime(&gh);
- if (unlikely(error))
- goto out_unlock;
- }
-skip_lock:
if (gfs2_is_stuffed(ip)) {
error = stuffed_readpage(ip, page);
unlock_page(page);
- } else
- error = mpage_readpage(page, gfs2_get_block);
+ } else {
+ error = mpage_readpage(page, gfs2_block_map);
+ }
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- error = -EIO;
+ return -EIO;
+
+ return error;
+}
+
+/**
+ * gfs2_readpage - read a page of a file
+ * @file: The file to read
+ * @page: The page of the file
+ *
+ * This deals with the locking required. We use a trylock in order to
+ * avoid the page lock / glock ordering problems, returning AOP_TRUNCATED_PAGE
+ * in the event that we are unable to get the lock.
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+ struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct gfs2_holder gh;
+ int error;
- if (do_unlock) {
- gfs2_glock_dq_m(1, &gh);
- gfs2_holder_uninit(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
+ error = gfs2_glock_nq_atime(&gh);
+ if (unlikely(error)) {
+ unlock_page(page);
+ goto out;
}
+ error = __gfs2_readpage(file, page);
+ gfs2_glock_dq(&gh);
out:
- return error;
-out_unlock:
- unlock_page(page);
+ gfs2_holder_uninit(&gh);
if (error == GLR_TRYFAILED) {
- error = AOP_TRUNCATED_PAGE;
yield();
+ return AOP_TRUNCATED_PAGE;
}
- if (do_unlock)
- gfs2_holder_uninit(&gh);
- goto out;
+ return error;
+}
+
+/**
+ * gfs2_internal_read - read an internal file
+ * @ip: The gfs2 inode
+ * @ra_state: The readahead state (or NULL for no readahead)
+ * @buf: The buffer to fill
+ * @pos: The file position
+ * @size: The amount to read
+ *
+ */
+
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+ char *buf, loff_t *pos, unsigned size)
+{
+ struct address_space *mapping = ip->i_inode.i_mapping;
+ unsigned long index = *pos / PAGE_CACHE_SIZE;
+ unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+ unsigned copied = 0;
+ unsigned amt;
+ struct page *page;
+ void *p;
+
+ do {
+ amt = size - copied;
+ if (offset + size > PAGE_CACHE_SIZE)
+ amt = PAGE_CACHE_SIZE - offset;
+ page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ p = kmap_atomic(page, KM_USER0);
+ memcpy(buf + copied, p + offset, amt);
+ kunmap_atomic(p, KM_USER0);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ copied += amt;
+ index++;
+ offset = 0;
+ } while(copied < size);
+ (*pos) += size;
+ return size;
}
/**
@@ -300,10 +582,9 @@ out_unlock:
* Any I/O we ignore at this time will be done via readpage later.
* 2. We don't handle stuffed files here we let readpage do the honours.
* 3. mpage_readpages() does most of the heavy lifting in the common case.
- * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
- * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
- * well as read-ahead.
+ * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
*/
+
static int gfs2_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
@@ -311,42 +592,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_holder gh;
- int ret = 0;
- int do_unlock = 0;
+ int ret;
- if (likely(file != &gfs2_internal_file_sentinel)) {
- if (file) {
- struct gfs2_file *gf = file->private_data;
- if (test_bit(GFF_EXLOCK, &gf->f_flags))
- goto skip_lock;
- }
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
- LM_FLAG_TRY_1CB|GL_ATIME, &gh);
- do_unlock = 1;
- ret = gfs2_glock_nq_atime(&gh);
- if (ret == GLR_TRYFAILED)
- goto out_noerror;
- if (unlikely(ret))
- goto out_unlock;
- }
-skip_lock:
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ ret = gfs2_glock_nq_atime(&gh);
+ if (unlikely(ret))
+ goto out_uninit;
if (!gfs2_is_stuffed(ip))
- ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-
- if (do_unlock) {
- gfs2_glock_dq_m(1, &gh);
- gfs2_holder_uninit(&gh);
- }
-out:
+ ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
+ gfs2_glock_dq(&gh);
+out_uninit:
+ gfs2_holder_uninit(&gh);
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
ret = -EIO;
return ret;
-out_noerror:
- ret = 0;
-out_unlock:
- if (do_unlock)
- gfs2_holder_uninit(&gh);
- goto out;
}
/**
@@ -382,20 +641,11 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(error))
goto out_uninit;
- error = -ENOMEM;
- page = __grab_cache_page(mapping, index);
- *pagep = page;
- if (!page)
- goto out_unlock;
-
gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
-
error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
if (error)
- goto out_putpage;
-
+ goto out_unlock;
- ip->i_alloc.al_requested = 0;
if (alloc_required) {
al = gfs2_alloc_get(ip);
@@ -424,40 +674,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (error)
goto out_trans_fail;
+ error = -ENOMEM;
+ page = __grab_cache_page(mapping, index);
+ *pagep = page;
+ if (unlikely(!page))
+ goto out_endtrans;
+
if (gfs2_is_stuffed(ip)) {
+ error = 0;
if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
error = gfs2_unstuff_dinode(ip, page);
if (error == 0)
goto prepare_write;
- } else if (!PageUptodate(page))
+ } else if (!PageUptodate(page)) {
error = stuffed_readpage(ip, page);
+ }
goto out;
}
prepare_write:
- error = block_prepare_write(page, from, to, gfs2_get_block);
-
+ error = block_prepare_write(page, from, to, gfs2_block_map);
out:
- if (error) {
- gfs2_trans_end(sdp);
+ if (error == 0)
+ return 0;
+
+ page_cache_release(page);
+ if (pos + len > ip->i_inode.i_size)
+ vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+out_endtrans:
+ gfs2_trans_end(sdp);
out_trans_fail:
- if (alloc_required) {
- gfs2_inplace_release(ip);
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
out_qunlock:
- gfs2_quota_unlock(ip);
+ gfs2_quota_unlock(ip);
out_alloc_put:
- gfs2_alloc_put(ip);
- }
-out_putpage:
- page_cache_release(page);
- if (pos + len > ip->i_inode.i_size)
- vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+ gfs2_alloc_put(ip);
+ }
out_unlock:
- gfs2_glock_dq_m(1, &ip->i_gh);
+ gfs2_glock_dq(&ip->i_gh);
out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
- }
-
+ gfs2_holder_uninit(&ip->i_gh);
return error;
}
@@ -565,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh;
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_dinode *di;
unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
unsigned int to = from + len;
@@ -585,19 +842,16 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
if (gfs2_is_stuffed(ip))
return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
- if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ if (!gfs2_is_writeback(ip))
gfs2_page_add_databufs(ip, page, from, to);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
- if (likely(ret >= 0)) {
- copied = ret;
- if ((pos + copied) > inode->i_size) {
- di = (struct gfs2_dinode *)dibh->b_data;
- ip->i_di.di_size = inode->i_size;
- di->di_size = cpu_to_be64(inode->i_size);
- mark_inode_dirty(inode);
- }
+ if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+ di = (struct gfs2_dinode *)dibh->b_data;
+ ip->i_di.di_size = inode->i_size;
+ di->di_size = cpu_to_be64(inode->i_size);
+ mark_inode_dirty(inode);
}
if (inode == sdp->sd_rindex)
@@ -606,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
brelse(dibh);
gfs2_trans_end(sdp);
failed:
- if (al->al_requested) {
+ if (al) {
gfs2_inplace_release(ip);
gfs2_quota_unlock(ip);
gfs2_alloc_put(ip);
@@ -625,11 +879,7 @@ failed:
static int gfs2_set_page_dirty(struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-
- if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
- SetPageChecked(page);
+ SetPageChecked(page);
return __set_page_dirty_buffers(page);
}
@@ -653,7 +903,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
return 0;
if (!gfs2_is_stuffed(ip))
- dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
+ dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
gfs2_glock_dq_uninit(&i_gh);
@@ -719,13 +969,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
{
/*
* Should we return an error here? I can't see that O_DIRECT for
- * a journaled file makes any sense. For now we'll silently fall
- * back to buffered I/O, likewise we do the same for stuffed
- * files since they are (a) small and (b) unaligned.
+ * a stuffed file makes any sense. For now we'll silently fall
+ * back to buffered I/O
*/
- if (gfs2_is_jdata(ip))
- return 0;
-
if (gfs2_is_stuffed(ip))
return 0;
@@ -836,9 +1082,23 @@ cannot_release:
return 0;
}
-const struct address_space_operations gfs2_file_aops = {
- .writepage = gfs2_writepage,
- .writepages = gfs2_writepages,
+static const struct address_space_operations gfs2_writeback_aops = {
+ .writepage = gfs2_writeback_writepage,
+ .writepages = gfs2_writeback_writepages,
+ .readpage = gfs2_readpage,
+ .readpages = gfs2_readpages,
+ .sync_page = block_sync_page,
+ .write_begin = gfs2_write_begin,
+ .write_end = gfs2_write_end,
+ .bmap = gfs2_bmap,
+ .invalidatepage = gfs2_invalidatepage,
+ .releasepage = gfs2_releasepage,
+ .direct_IO = gfs2_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
+static const struct address_space_operations gfs2_ordered_aops = {
+ .writepage = gfs2_ordered_writepage,
.readpage = gfs2_readpage,
.readpages = gfs2_readpages,
.sync_page = block_sync_page,
@@ -849,5 +1109,34 @@ const struct address_space_operations gfs2_file_aops = {
.invalidatepage = gfs2_invalidatepage,
.releasepage = gfs2_releasepage,
.direct_IO = gfs2_direct_IO,
+ .migratepage = buffer_migrate_page,
};
+static const struct address_space_operations gfs2_jdata_aops = {
+ .writepage = gfs2_jdata_writepage,
+ .writepages = gfs2_jdata_writepages,
+ .readpage = gfs2_readpage,
+ .readpages = gfs2_readpages,
+ .sync_page = block_sync_page,
+ .write_begin = gfs2_write_begin,
+ .write_end = gfs2_write_end,
+ .set_page_dirty = gfs2_set_page_dirty,
+ .bmap = gfs2_bmap,
+ .invalidatepage = gfs2_invalidatepage,
+ .releasepage = gfs2_releasepage,
+};
+
+void gfs2_set_aops(struct inode *inode)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+
+ if (gfs2_is_writeback(ip))
+ inode->i_mapping->a_ops = &gfs2_writeback_aops;
+ else if (gfs2_is_ordered(ip))
+ inode->i_mapping->a_ops = &gfs2_ordered_aops;
+ else if (gfs2_is_jdata(ip))
+ inode->i_mapping->a_ops = &gfs2_jdata_aops;
+ else
+ BUG();
+}
+
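With three address_space_operations tables, the aops have to be re-selected whenever an inode's data mode changes; the do_gfs2_set_flags() hunk in ops_file.c later in this patch does exactly that after toggling GFS2_DIF_JDATA. A minimal sketch of that call pattern (hypothetical helper; it only rearranges calls that appear elsewhere in this patch):

/* Sketch: after flipping the JDATA flag on an inode, flush anything
 * written under the old mode and switch the aops to match. */
static int example_switch_data_mode(struct inode *inode, int to_jdata)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	int error;

	if (!to_jdata)			/* leaving jdata: push journaled data out */
		gfs2_log_flush(sdp, ip->i_gl);
	error = filemap_fdatawrite(inode->i_mapping);
	if (!error)
		error = filemap_fdatawait(inode->i_mapping);
	if (error)
		return error;

	/* ... update ip->i_di.di_flags under a transaction ... */
	gfs2_set_inode_flags(inode);
	gfs2_set_aops(inode);		/* pick writeback/ordered/jdata aops */
	return 0;
}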
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index fa1b5b3d28b..5da21285bba 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,9 +14,10 @@
#include <linux/buffer_head.h>
#include <linux/mm.h>
-extern const struct address_space_operations gfs2_file_aops;
-extern int gfs2_get_block(struct inode *inode, sector_t lblock,
- struct buffer_head *bh_result, int create);
extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+ struct file_ra_state *ra_state,
+ char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bb11fd6752d..f4842f2548c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,57 +33,12 @@
#include "lm.h"
#include "log.h"
#include "meta_io.h"
-#include "ops_file.h"
-#include "ops_vm.h"
#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
#include "eaops.h"
-
-/*
- * Most fields left uninitialised to catch anybody who tries to
- * use them. f_flags set to prevent file_accessed() from touching
- * any other part of this. Its use is purely as a flag so that we
- * know (in readpage()) whether or not do to locking.
- */
-struct file gfs2_internal_file_sentinel = {
- .f_flags = O_NOATIME|O_RDONLY,
-};
-
-static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
- unsigned long offset, unsigned long size)
-{
- char *kaddr;
- unsigned long count = desc->count;
-
- if (size > count)
- size = count;
-
- kaddr = kmap(page);
- memcpy(desc->arg.data, kaddr + offset, size);
- kunmap(page);
-
- desc->count = count - size;
- desc->written += size;
- desc->arg.buf += size;
- return size;
-}
-
-int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
- char *buf, loff_t *pos, unsigned size)
-{
- struct inode *inode = &ip->i_inode;
- read_descriptor_t desc;
- desc.written = 0;
- desc.arg.data = buf;
- desc.count = size;
- desc.error = 0;
- do_generic_mapping_read(inode->i_mapping, ra_state,
- &gfs2_internal_file_sentinel, pos, &desc,
- gfs2_read_actor);
- return desc.written ? desc.written : desc.error;
-}
+#include "ops_address.h"
/**
* gfs2_llseek - seek to a location in a file
@@ -214,7 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
if (put_user(fsflags, ptr))
error = -EFAULT;
- gfs2_glock_dq_m(1, &gh);
+ gfs2_glock_dq(&gh);
gfs2_holder_uninit(&gh);
return error;
}
@@ -291,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
if (error)
goto out;
}
-
+ if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
+ if (flags & GFS2_DIF_JDATA)
+ gfs2_log_flush(sdp, ip->i_gl);
+ error = filemap_fdatawrite(inode->i_mapping);
+ if (error)
+ goto out;
+ error = filemap_fdatawait(inode->i_mapping);
+ if (error)
+ goto out;
+ }
error = gfs2_trans_begin(sdp, RES_DINODE, 0);
if (error)
goto out;
@@ -303,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
gfs2_set_inode_flags(inode);
+ gfs2_set_aops(inode);
out_trans_end:
gfs2_trans_end(sdp);
out:
@@ -338,6 +303,128 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -ENOTTY;
}
+/**
+ * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * @page: The (locked) page to allocate backing for
+ *
+ * We try to allocate all the blocks required for the page in
+ * one go. This might fail for various reasons, so we keep
+ * trying until all the blocks to back this page are allocated.
+ * If some of the blocks are already allocated, that's ok too.
+ */
+
+static int gfs2_allocate_page_backing(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head bh;
+ unsigned long size = PAGE_CACHE_SIZE;
+ u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ do {
+ bh.b_state = 0;
+ bh.b_size = size;
+ gfs2_block_map(inode, lblock, &bh, 1);
+ if (!buffer_mapped(&bh))
+ return -EIO;
+ size -= bh.b_size;
+ lblock += (bh.b_size >> inode->i_blkbits);
+ } while(size > 0);
+ return 0;
+}
+
+/**
+ * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
+ * @vma: The virtual memory area
+ * @page: The page which is about to become writable
+ *
+ * When the page becomes writable, we need to ensure that we have
+ * blocks allocated on disk to back that page.
+ */
+
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ unsigned long last_index;
+ u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+ unsigned int data_blocks, ind_blocks, rblocks;
+ int alloc_required = 0;
+ struct gfs2_holder gh;
+ struct gfs2_alloc *al;
+ int ret;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
+ ret = gfs2_glock_nq_atime(&gh);
+ if (ret)
+ goto out;
+
+ set_bit(GIF_SW_PAGED, &ip->i_flags);
+ gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+ ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
+ if (ret || !alloc_required)
+ goto out_unlock;
+ ret = -ENOMEM;
+ al = gfs2_alloc_get(ip);
+ if (al == NULL)
+ goto out_unlock;
+
+ ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (ret)
+ goto out_alloc_put;
+ ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+ if (ret)
+ goto out_quota_unlock;
+ al->al_requested = data_blocks + ind_blocks;
+ ret = gfs2_inplace_reserve(ip);
+ if (ret)
+ goto out_quota_unlock;
+
+ rblocks = RES_DINODE + ind_blocks;
+ if (gfs2_is_jdata(ip))
+ rblocks += data_blocks ? data_blocks : 1;
+ if (ind_blocks || data_blocks)
+ rblocks += RES_STATFS + RES_QUOTA;
+ ret = gfs2_trans_begin(sdp, rblocks, 0);
+ if (ret)
+ goto out_trans_fail;
+
+ lock_page(page);
+ ret = -EINVAL;
+ last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
+ if (page->index > last_index)
+ goto out_unlock_page;
+ ret = 0;
+ if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
+ goto out_unlock_page;
+ if (gfs2_is_stuffed(ip)) {
+ ret = gfs2_unstuff_dinode(ip, page);
+ if (ret)
+ goto out_unlock_page;
+ }
+ ret = gfs2_allocate_page_backing(page);
+
+out_unlock_page:
+ unlock_page(page);
+ gfs2_trans_end(sdp);
+out_trans_fail:
+ gfs2_inplace_release(ip);
+out_quota_unlock:
+ gfs2_quota_unlock(ip);
+out_alloc_put:
+ gfs2_alloc_put(ip);
+out_unlock:
+ gfs2_glock_dq(&gh);
+out:
+ gfs2_holder_uninit(&gh);
+ return ret;
+}
+
+static struct vm_operations_struct gfs2_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = gfs2_page_mkwrite,
+};
+
/**
* gfs2_mmap -
@@ -360,14 +447,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
return error;
}
- /* This is VM_MAYWRITE instead of VM_WRITE because a call
- to mprotect() can turn on VM_WRITE later. */
-
- if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
- (VM_MAYSHARE | VM_MAYWRITE))
- vma->vm_ops = &gfs2_vm_ops_sharewrite;
- else
- vma->vm_ops = &gfs2_vm_ops_private;
+ vma->vm_ops = &gfs2_vm_ops;
gfs2_glock_dq_uninit(&i_gh);
@@ -538,15 +618,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (__mandatory_lock(&ip->i_inode))
return -ENOLCK;
- if (sdp->sd_args.ar_localflocks) {
- if (IS_GETLK(cmd)) {
- posix_test_lock(file, fl);
- return 0;
- } else {
- return posix_lock_file_wait(file, fl);
- }
- }
-
if (cmd == F_CANCELLK) {
/* Hack: */
cmd = F_SETLK;
@@ -632,16 +703,12 @@ static void do_unflock(struct file *file, struct file_lock *fl)
static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
if (__mandatory_lock(&ip->i_inode))
return -ENOLCK;
- if (sdp->sd_args.ar_localflocks)
- return flock_lock_file_wait(file, fl);
-
if (fl->fl_type == F_UNLCK) {
do_unflock(file, fl);
return 0;
@@ -678,3 +745,27 @@ const struct file_operations gfs2_dir_fops = {
.flock = gfs2_flock,
};
+const struct file_operations gfs2_file_fops_nolock = {
+ .llseek = gfs2_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .write = do_sync_write,
+ .aio_write = generic_file_aio_write,
+ .unlocked_ioctl = gfs2_ioctl,
+ .mmap = gfs2_mmap,
+ .open = gfs2_open,
+ .release = gfs2_close,
+ .fsync = gfs2_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+ .setlease = gfs2_setlease,
+};
+
+const struct file_operations gfs2_dir_fops_nolock = {
+ .readdir = gfs2_readdir,
+ .unlocked_ioctl = gfs2_ioctl,
+ .open = gfs2_open,
+ .release = gfs2_close,
+ .fsync = gfs2_fsync,
+};
+
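The new *_nolock operation tables drop the ->lock and ->flock methods entirely, so the VFS falls back to local POSIX/flock handling, which is what the removed ar_localflocks branches in gfs2_lock() and gfs2_flock() used to emulate by hand. A hedged sketch of how the table could be chosen at inode setup time (the actual selection happens outside this section; the helper name is hypothetical):

/* Hypothetical selection helper: with the localflocks mount option,
 * plain VFS locking is wanted, so install the fops that define no
 * ->lock/->flock methods. */
static const struct file_operations *
example_pick_file_fops(const struct gfs2_sbd *sdp)
{
	if (sdp->sd_args.ar_localflocks)
		return &gfs2_file_fops_nolock;
	return &gfs2_file_fops;
}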
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
deleted file mode 100644
index 7e5d8ec9c84..00000000000
--- a/fs/gfs2/ops_file.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_FILE_DOT_H__
-#define __OPS_FILE_DOT_H__
-
-#include <linux/fs.h>
-struct gfs2_inode;
-
-extern struct file gfs2_internal_file_sentinel;
-extern int gfs2_internal_read(struct gfs2_inode *ip,
- struct file_ra_state *ra_state,
- char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_inode_flags(struct inode *inode);
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-
-#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 17de58e83d9..43d511bba52 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -21,6 +21,7 @@
#include "gfs2.h"
#include "incore.h"
+#include "bmap.h"
#include "daemon.h"
#include "glock.h"
#include "glops.h"
@@ -59,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_inum_mutex);
spin_lock_init(&sdp->sd_statfs_spin);
- mutex_init(&sdp->sd_statfs_mutex);
spin_lock_init(&sdp->sd_rindex_spin);
mutex_init(&sdp->sd_rindex_mutex);
@@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_log_lock);
- INIT_LIST_HEAD(&sdp->sd_log_le_gl);
INIT_LIST_HEAD(&sdp->sd_log_le_buf);
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
INIT_LIST_HEAD(&sdp->sd_log_le_rg);
@@ -303,6 +302,67 @@ out:
return error;
}
+/**
+ * map_journal_extents - create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal. This will save
+ * us time when writing journal blocks. Most journals will have only one
+ * extent that maps all their logical blocks. That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential. Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out. But it's still possible. These journals might have
+ * several extents.
+ *
+ * TODO: This should be done in bigger chunks rather than one block at a time,
+ * but since it's only done at mount time, I'm not worried about the
+ * time it takes.
+ */
+static int map_journal_extents(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd = sdp->sd_jdesc;
+ unsigned int lb;
+ u64 db, prev_db; /* logical block, disk block, prev disk block */
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_journal_extent *jext = NULL;
+ struct buffer_head bh;
+ int rc = 0;
+
+ prev_db = 0;
+
+ for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
+ bh.b_state = 0;
+ bh.b_blocknr = 0;
+ bh.b_size = 1 << ip->i_inode.i_blkbits;
+ rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
+ db = bh.b_blocknr;
+ if (rc || !db) {
+ printk(KERN_INFO "GFS2 journal mapping error %d: lb="
+ "%u db=%llu\n", rc, lb, (unsigned long long)db);
+ break;
+ }
+ if (!prev_db || db != prev_db + 1) {
+ jext = kzalloc(sizeof(struct gfs2_journal_extent),
+ GFP_KERNEL);
+ if (!jext) {
+ printk(KERN_INFO "GFS2 error: out of memory "
+ "mapping journal extents.\n");
+ rc = -ENOMEM;
+ break;
+ }
+ jext->dblock = db;
+ jext->lblock = lb;
+ jext->blocks = 1;
+ list_add_tail(&jext->extent_list, &jd->extent_list);
+ } else {
+ jext->blocks++;
+ }
+ prev_db = db;
+ }
+ return rc;
+}
+
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct gfs2_holder ji_gh;
@@ -340,7 +400,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (sdp->sd_args.ar_spectator) {
sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
- sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+ atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
} else {
if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
fs_err(sdp, "can't mount journal #%u\n",
@@ -377,7 +437,10 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
sdp->sd_jdesc->jd_jid, error);
goto fail_jinode_gh;
}
- sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+ atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+
+ /* Map the extents for this journal's blocks */
+ map_journal_extents(sdp);
}
if (sdp->sd_lockstruct.ls_first) {
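The map_journal_extents() comment above explains that the journal's logical blocks are mapped once at mount time so later journal writes can avoid repeated block-map lookups. A minimal sketch of how such an extent list could be consulted is given below; the function name is hypothetical and only the gfs2_journal_extent fields visible in this patch (lblock, dblock, blocks, extent_list) are assumed.

/*
 * Sketch only, not part of the patch: translate a logical journal block
 * into a disk block using the extent list built by map_journal_extents().
 * A return of 0 means the block is not covered and the caller would fall
 * back to gfs2_block_map().
 */
static u64 journal_extent_lookup(struct gfs2_jdesc *jd, unsigned int lblock)
{
	struct gfs2_journal_extent *jext;

	list_for_each_entry(jext, &jd->extent_list, extent_list) {
		if (lblock >= jext->lblock &&
		    lblock < jext->lblock + jext->blocks)
			return jext->dblock + (lblock - jext->lblock);
	}
	return 0;
}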
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 291f0c7eaa3..9f71372c175 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
if (!IS_ERR(inode)) {
gfs2_trans_end(sdp);
- if (dip->i_alloc.al_rgd)
+ if (dip->i_alloc->al_rgd)
gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
@@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
if (inode && IS_ERR(inode))
return ERR_PTR(PTR_ERR(inode));
- if (inode)
+ if (inode) {
+ struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+ struct gfs2_holder gh;
+ int error;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
+ }
+ gfs2_glock_dq_uninit(&gh);
return d_splice_alias(inode, dentry);
+ }
d_add(dentry, inode);
return NULL;
@@ -366,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
}
gfs2_trans_end(sdp);
- if (dip->i_alloc.al_rgd)
+ if (dip->i_alloc->al_rgd)
gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
@@ -442,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
gfs2_trans_end(sdp);
- if (dip->i_alloc.al_rgd)
+ if (dip->i_alloc->al_rgd)
gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
@@ -548,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
}
gfs2_trans_end(sdp);
- if (dip->i_alloc.al_rgd)
+ if (dip->i_alloc->al_rgd)
gfs2_inplace_release(dip);
gfs2_quota_unlock(dip);
gfs2_alloc_put(dip);
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index 34f0caac1a0..fd8cee231e1 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -16,5 +16,11 @@ extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
extern const struct inode_operations gfs2_symlink_iops;
extern const struct inode_operations gfs2_dev_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 950f31460e8..5e524217944 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
if (ip) {
ip->i_flags = 0;
ip->i_gl = NULL;
- ip->i_last_pfault = jiffies;
}
return &ip->i_inode;
}
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
deleted file mode 100644
index 927d739d468..00000000000
--- a/fs/gfs2/ops_vm.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "bmap.h"
-#include "glock.h"
-#include "inode.h"
-#include "ops_vm.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-
-static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
-
- set_bit(GIF_PAGED, &ip->i_flags);
- return filemap_fault(vma, vmf);
-}
-
-static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned long index = page->index;
- u64 lblock = index << (PAGE_CACHE_SHIFT -
- sdp->sd_sb.sb_bsize_shift);
- unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
- struct gfs2_alloc *al;
- unsigned int data_blocks, ind_blocks;
- unsigned int x;
- int error;
-
- al = gfs2_alloc_get(ip);
-
- error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
- if (error)
- goto out;
-
- error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
- if (error)
- goto out_gunlock_q;
-
- gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-
- al->al_requested = data_blocks + ind_blocks;
-
- error = gfs2_inplace_reserve(ip);
- if (error)
- goto out_gunlock_q;
-
- error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
- ind_blocks + RES_DINODE +
- RES_STATFS + RES_QUOTA, 0);
- if (error)
- goto out_ipres;
-
- if (gfs2_is_stuffed(ip)) {
- error = gfs2_unstuff_dinode(ip, NULL);
- if (error)
- goto out_trans;
- }
-
- for (x = 0; x < blocks; ) {
- u64 dblock;
- unsigned int extlen;
- int new = 1;
-
- error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
- if (error)
- goto out_trans;
-
- lblock += extlen;
- x += extlen;
- }
-
- gfs2_assert_warn(sdp, al->al_alloced);
-
-out_trans:
- gfs2_trans_end(sdp);
-out_ipres:
- gfs2_inplace_release(ip);
-out_gunlock_q:
- gfs2_quota_unlock(ip);
-out:
- gfs2_alloc_put(ip);
- return error;
-}
-
-static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
- struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct gfs2_file *gf = file->private_data;
- struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- struct gfs2_holder i_gh;
- int alloc_required;
- int error;
- int ret = 0;
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
- if (error)
- goto out;
-
- set_bit(GIF_PAGED, &ip->i_flags);
- set_bit(GIF_SW_PAGED, &ip->i_flags);
-
- error = gfs2_write_alloc_required(ip,
- (u64)vmf->pgoff << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, &alloc_required);
- if (error) {
- ret = VM_FAULT_OOM; /* XXX: are these right? */
- goto out_unlock;
- }
-
- set_bit(GFF_EXLOCK, &gf->f_flags);
- ret = filemap_fault(vma, vmf);
- clear_bit(GFF_EXLOCK, &gf->f_flags);
- if (ret & VM_FAULT_ERROR)
- goto out_unlock;
-
- if (alloc_required) {
- /* XXX: do we need to drop page lock around alloc_page_backing?*/
- error = alloc_page_backing(ip, vmf->page);
- if (error) {
- /*
- * VM_FAULT_LOCKED should always be the case for
- * filemap_fault, but it may not be in a future
- * implementation.
- */
- if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf->page);
- page_cache_release(vmf->page);
- ret = VM_FAULT_OOM;
- goto out_unlock;
- }
- set_page_dirty(vmf->page);
- }
-
-out_unlock:
- gfs2_glock_dq_uninit(&i_gh);
-out:
- return ret;
-}
-
-struct vm_operations_struct gfs2_vm_ops_private = {
- .fault = gfs2_private_fault,
-};
-
-struct vm_operations_struct gfs2_vm_ops_sharewrite = {
- .fault = gfs2_sharewrite_fault,
-};
-
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
deleted file mode 100644
index 4ae8f43ed5e..00000000000
--- a/fs/gfs2/ops_vm.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_VM_DOT_H__
-#define __OPS_VM_DOT_H__
-
-#include <linux/mm.h>
-
-extern struct vm_operations_struct gfs2_vm_ops_private;
-extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
-
-#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index addb51e0f13..a08dabd6ce9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -59,7 +59,6 @@
#include "super.h"
#include "trans.h"
#include "inode.h"
-#include "ops_file.h"
#include "ops_address.h"
#include "util.h"
@@ -274,10 +273,10 @@ static int bh_get(struct gfs2_quota_data *qd)
}
block = qd->qd_slot / sdp->sd_qc_per_block;
- offset = qd->qd_slot % sdp->sd_qc_per_block;;
+ offset = qd->qd_slot % sdp->sd_qc_per_block;
bh_map.b_size = 1 << ip->i_inode.i_blkbits;
- error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
+ error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
if (error)
goto fail;
error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
@@ -454,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_quota_data **qd = al->al_qd;
int error;
@@ -502,7 +501,7 @@ out:
void gfs2_quota_unhold(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
unsigned int x;
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
@@ -646,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
}
if (!buffer_mapped(bh)) {
- gfs2_get_block(inode, iblock, bh, 1);
+ gfs2_block_map(inode, iblock, bh, 1);
if (!buffer_mapped(bh))
goto unlock;
}
@@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder i_gh;
struct gfs2_quota_host q;
char buf[sizeof(struct gfs2_quota)];
- struct file_ra_state ra_state;
int error;
struct gfs2_quota_lvb *qlvb;
- file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
restart:
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
if (error)
@@ -820,8 +817,8 @@ restart:
memset(buf, 0, sizeof(struct gfs2_quota));
pos = qd2offset(qd);
- error = gfs2_internal_read(ip, &ra_state, buf,
- &pos, sizeof(struct gfs2_quota));
+ error = gfs2_internal_read(ip, NULL, buf, &pos,
+ sizeof(struct gfs2_quota));
if (error < 0)
goto fail_gunlock;
@@ -856,7 +853,7 @@ fail:
int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
unsigned int x;
int error = 0;
@@ -924,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd)
void gfs2_quota_unlock(struct gfs2_inode *ip)
{
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
unsigned int x;
@@ -972,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_quota_data *qd;
s64 value;
unsigned int x;
@@ -1016,10 +1013,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 uid, u32 gid)
{
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_quota_data *qd;
unsigned int x;
- unsigned int found = 0;
if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
return;
@@ -1032,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
(qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
do_qc(qd, change);
- found++;
}
}
}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index beb6c7ac008..b249e294a95 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
lblock = head->lh_blkno;
gfs2_replay_incr_blk(sdp, &lblock);
bh_map.b_size = 1 << ip->i_inode.i_blkbits;
- error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
+ error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
if (error)
return error;
if (!bh_map.b_blocknr) {
@@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
ro = 1;
} else {
- if (sdp->sd_vfs->s_flags & MS_RDONLY)
- ro = 1;
+ if (sdp->sd_vfs->s_flags & MS_RDONLY) {
+ /* check if device itself is read-only */
+ ro = bdev_read_only(sdp->sd_vfs->s_bdev);
+ if (!ro) {
+ fs_info(sdp, "recovery required on "
+ "read-only filesystem.\n");
+ fs_info(sdp, "write access will be "
+ "enabled during recovery.\n");
+ }
+ }
}
if (ro) {
- fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
- jd->jd_jid);
+ fs_warn(sdp, "jid=%u: Can't replay: read-only block "
+ "device\n", jd->jd_jid);
error = -EROFS;
goto fail_gunlock_tr;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 708c287e1d0..3552110b2e5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -25,10 +25,10 @@
#include "rgrp.h"
#include "super.h"
#include "trans.h"
-#include "ops_file.h"
#include "util.h"
#include "log.h"
#include "inode.h"
+#include "ops_address.h"
#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)
@@ -126,41 +126,43 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
* Return: the block number (bitmap buffer scope) that was found
*/
-static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
- unsigned int buflen, u32 goal,
- unsigned char old_state)
+static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
+ unsigned char old_state)
{
- unsigned char *byte, *end, alloc;
+ unsigned char *byte;
u32 blk = goal;
- unsigned int bit;
+ unsigned int bit, bitlong;
+ unsigned long *plong, plong55;
byte = buffer + (goal / GFS2_NBBY);
+ plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
- end = buffer + buflen;
- alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
-
- while (byte < end) {
- /* If we're looking for a free block we can eliminate all
- bitmap settings with 0x55, which represents four data
- blocks in a row. If we're looking for a data block, we can
- eliminate 0x00 which corresponds to four free blocks. */
- if ((*byte & 0x55) == alloc) {
- blk += (8 - bit) >> 1;
-
- bit = 0;
- byte++;
-
+ bitlong = bit;
+#if BITS_PER_LONG == 32
+ plong55 = 0x55555555;
+#else
+ plong55 = 0x5555555555555555;
+#endif
+ while (byte < buffer + buflen) {
+
+ if (bitlong == 0 && old_state == 0 && *plong == plong55) {
+ plong++;
+ byte += sizeof(unsigned long);
+ blk += sizeof(unsigned long) * GFS2_NBBY;
continue;
}
-
if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
return blk;
-
bit += GFS2_BIT_SIZE;
if (bit >= 8) {
bit = 0;
byte++;
}
+ bitlong += GFS2_BIT_SIZE;
+ if (bitlong >= sizeof(unsigned long) * 8) {
+ bitlong = 0;
+ plong++;
+ }
blk++;
}
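The rewritten gfs2_bitfit() leans on the bitmap encoding the deleted comment described: each block takes GFS2_BIT_SIZE (two) bits, so GFS2_NBBY (four) blocks fit in a byte, a byte of 0x55 means four consecutive blocks in state 1 (used), and a whole long of 0x55... can be stepped over when searching for free (state 0) blocks. A short sketch of reading one block's state, using only constants already referenced in this hunk, follows; the helper name is made up.

/*
 * Sketch only: fetch the two-bit allocation state of block 'blk' from a
 * resource group bitmap buffer, mirroring the addressing used above.
 */
static unsigned char rgrp_block_state(const unsigned char *buffer, u32 blk)
{
	const unsigned char *byte = buffer + blk / GFS2_NBBY;
	unsigned int bit = (blk % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;	/* e.g. GFS2_BLKST_FREE == 0 */
}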
@@ -817,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
{
- struct gfs2_alloc *al = &ip->i_alloc;
-
- /* FIXME: Should assert that the correct locks are held here... */
- memset(al, 0, sizeof(*al));
- return al;
+ BUG_ON(ip->i_alloc != NULL);
+ ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
+ return ip->i_alloc;
}
/**
@@ -1059,26 +1059,34 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
struct inode *inode = NULL;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
int flags = LM_FLAG_TRY;
int skipped = 0;
int loops = 0;
- int error;
+ int error, rg_locked;
/* Try recently successful rgrps */
rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
while (rgd) {
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_TRY, &al->al_rgd_gh);
+ rg_locked = 0;
+
+ if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+ rg_locked = 1;
+ error = 0;
+ } else {
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ LM_FLAG_TRY, &al->al_rgd_gh);
+ }
switch (error) {
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
if (rgd->rd_flags & GFS2_RDF_CHECK)
inode = try_rgrp_unlink(rgd, last_unlinked);
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ if (!rg_locked)
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
rgd = recent_rgrp_next(rgd, 1);
@@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
begin = rgd = forward_rgrp_get(sdp);
for (;;) {
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
- &al->al_rgd_gh);
+ rg_locked = 0;
+
+ if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+ rg_locked = 1;
+ error = 0;
+ } else {
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+ &al->al_rgd_gh);
+ }
switch (error) {
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
if (rgd->rd_flags & GFS2_RDF_CHECK)
inode = try_rgrp_unlink(rgd, last_unlinked);
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ if (!rg_locked)
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
break;
@@ -1158,7 +1174,7 @@ out:
int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct inode *inode;
int error = 0;
u64 last_unlinked = NO_BLOCK;
@@ -1204,7 +1220,7 @@ try_again:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
fs_warn(sdp, "al_alloced = %u, al_requested = %u "
@@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
al->al_line);
al->al_rgd = NULL;
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ if (al->al_rgd_gh.gh_gl)
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
}
@@ -1301,11 +1318,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
bitmaps, so we must search the originals for that. */
if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
- blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+ blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
bi->bi_len, goal, old_state);
else
- blk = gfs2_bitfit(rgd,
- bi->bi_bh->b_data + bi->bi_offset,
+ blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
bi->bi_len, goal, old_state);
if (blk != BFITNOENT)
break;
@@ -1394,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
u64 gfs2_alloc_data(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_rgrpd *rgd = al->al_rgd;
u32 goal, blk;
u64 block;
@@ -1439,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
u64 gfs2_alloc_meta(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_alloc *al = ip->i_alloc;
struct gfs2_rgrpd *rgd = al->al_rgd;
u32 goal, blk;
u64 block;
@@ -1485,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct gfs2_alloc *al = &dip->i_alloc;
+ struct gfs2_alloc *al = dip->i_alloc;
struct gfs2_rgrpd *rgd = al->al_rgd;
u32 blk;
u64 block;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4c6adfc6f2..149bb161f4b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
static inline void gfs2_alloc_put(struct gfs2_inode *ip)
{
- return; /* So we can see where ip->i_alloc is used */
+ BUG_ON(ip->i_alloc == NULL);
+ kfree(ip->i_alloc);
+ ip->i_alloc = NULL;
}
int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
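Since gfs2_alloc_get() now allocates the per-inode allocation context on the heap and gfs2_alloc_put() frees it, every get must be balanced by a put once the reservation is finished. The pattern below is a simplified sketch assembled from the call sites touched elsewhere in this patch; the NULL check is an assumption (kzalloc can fail), and real callers also take quota locks before reserving, as in ops_inode.c above.

/* Sketch only: expected lifetime of ip->i_alloc after this change. */
static int example_reserve(struct gfs2_inode *ip, unsigned int blocks)
{
	struct gfs2_alloc *al = gfs2_alloc_get(ip);	/* kzalloc'd context */
	int error;

	if (!al)
		return -ENOMEM;

	al->al_requested = blocks;
	error = gfs2_inplace_reserve(ip);
	if (!error)
		gfs2_inplace_release(ip);

	gfs2_alloc_put(ip);				/* kfree + set to NULL */
	return error;
}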
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index dd3e737f528..ef0562c3bc7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -51,13 +51,9 @@ void gfs2_tune_init(struct gfs2_tune *gt)
{
spin_lock_init(&gt->gt_spin);
- gt->gt_ilimit = 100;
- gt->gt_ilimit_tries = 3;
- gt->gt_ilimit_min = 1;
gt->gt_demote_secs = 300;
gt->gt_incore_log_blocks = 1024;
gt->gt_log_flush_secs = 60;
- gt->gt_jindex_refresh_secs = 60;
gt->gt_recoverd_secs = 60;
gt->gt_logd_secs = 1;
gt->gt_quotad_secs = 5;
@@ -71,10 +67,8 @@ void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_new_files_jdata = 0;
gt->gt_new_files_directio = 0;
gt->gt_max_readahead = 1 << 18;
- gt->gt_lockdump_size = 131072;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
- gt->gt_reclaim_limit = 5000;
gt->gt_statfs_quantum = 30;
gt->gt_statfs_slow = 0;
}
@@ -393,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
if (!jd)
break;
+ INIT_LIST_HEAD(&jd->extent_list);
jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
if (!jd->jd_inode)
@@ -422,8 +417,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
void gfs2_jindex_free(struct gfs2_sbd *sdp)
{
- struct list_head list;
+ struct list_head list, *head;
struct gfs2_jdesc *jd;
+ struct gfs2_journal_extent *jext;
spin_lock(&sdp->sd_jindex_spin);
list_add(&list, &sdp->sd_jindex_list);
@@ -433,6 +429,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
while (!list_empty(&list)) {
jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+ head = &jd->extent_list;
+ while (!list_empty(head)) {
+ jext = list_entry(head->next,
+ struct gfs2_journal_extent,
+ extent_list);
+ list_del(&jext->extent_list);
+ kfree(jext);
+ }
list_del(&jd->jd_list);
iput(jd->jd_inode);
kfree(jd);
@@ -543,7 +547,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
if (error)
return error;
- gfs2_meta_cache_flush(ip);
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -686,9 +689,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
if (error)
return;
- mutex_lock(&sdp->sd_statfs_mutex);
gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
- mutex_unlock(&sdp->sd_statfs_mutex);
spin_lock(&sdp->sd_statfs_spin);
l_sc->sc_total += total;
@@ -736,9 +737,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
if (error)
goto out_bh2;
- mutex_lock(&sdp->sd_statfs_mutex);
gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
- mutex_unlock(&sdp->sd_statfs_mutex);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d9..eaa3b7b2f99 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock;
static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
+ return snprintf(buf, PAGE_SIZE, "%u:%u\n",
+ MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
}
static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
@@ -221,9 +222,7 @@ static struct kobj_type gfs2_ktype = {
.sysfs_ops = &gfs2_attr_ops,
};
-static struct kset gfs2_kset = {
- .ktype = &gfs2_ktype,
-};
+static struct kset *gfs2_kset;
/*
* display struct lm_lockstruct fields
@@ -427,13 +426,11 @@ TUNE_ATTR_2(name, name##_store)
TUNE_ATTR(demote_secs, 0);
TUNE_ATTR(incore_log_blocks, 0);
TUNE_ATTR(log_flush_secs, 0);
-TUNE_ATTR(jindex_refresh_secs, 0);
TUNE_ATTR(quota_warn_period, 0);
TUNE_ATTR(quota_quantum, 0);
TUNE_ATTR(atime_quantum, 0);
TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
-TUNE_ATTR(reclaim_limit, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
TUNE_ATTR(new_files_directio, 0);
@@ -450,13 +447,11 @@ static struct attribute *tune_attrs[] = {
&tune_attr_demote_secs.attr,
&tune_attr_incore_log_blocks.attr,
&tune_attr_log_flush_secs.attr,
- &tune_attr_jindex_refresh_secs.attr,
&tune_attr_quota_warn_period.attr,
&tune_attr_quota_quantum.attr,
&tune_attr_atime_quantum.attr,
&tune_attr_max_readahead.attr,
&tune_attr_complain_secs.attr,
- &tune_attr_reclaim_limit.attr,
&tune_attr_statfs_slow.attr,
&tune_attr_quota_simul_sync.attr,
&tune_attr_quota_cache_secs.attr,
@@ -495,14 +490,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
{
int error;
- sdp->sd_kobj.kset = &gfs2_kset;
- sdp->sd_kobj.ktype = &gfs2_ktype;
-
- error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
- if (error)
- goto fail;
-
- error = kobject_register(&sdp->sd_kobj);
+ sdp->sd_kobj.kset = gfs2_kset;
+ error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
+ "%s", sdp->sd_table_name);
if (error)
goto fail;
@@ -522,6 +512,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
if (error)
goto fail_args;
+ kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
return 0;
fail_args:
@@ -531,7 +522,7 @@ fail_counters:
fail_lockstruct:
sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
fail_reg:
- kobject_unregister(&sdp->sd_kobj);
+ kobject_put(&sdp->sd_kobj);
fail:
fs_err(sdp, "error %d adding sysfs files", error);
return error;
@@ -543,21 +534,22 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
sysfs_remove_group(&sdp->sd_kobj, &args_group);
sysfs_remove_group(&sdp->sd_kobj, &counters_group);
sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
- kobject_unregister(&sdp->sd_kobj);
+ kobject_put(&sdp->sd_kobj);
}
int gfs2_sys_init(void)
{
gfs2_sys_margs = NULL;
spin_lock_init(&gfs2_sys_margs_lock);
- kobject_set_name(&gfs2_kset.kobj, "gfs2");
- kobj_set_kset_s(&gfs2_kset, fs_subsys);
- return kset_register(&gfs2_kset);
+ gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
+ if (!gfs2_kset)
+ return -ENOMEM;
+ return 0;
}
void gfs2_sys_uninit(void)
{
kfree(gfs2_sys_margs);
- kset_unregister(&gfs2_kset);
+ kset_unregister(gfs2_kset);
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 717983e2c2a..73e5d92a657 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
gfs2_log_flush(sdp, NULL);
}
-void gfs2_trans_add_gl(struct gfs2_glock *gl)
-{
- lops_add(gl->gl_sbd, &gl->gl_le);
-}
-
/**
* gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
* @gl: the glock the buffer belongs to
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 043d5f4b9c4..e826f0dab80 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
void gfs2_trans_end(struct gfs2_sbd *sdp);
-void gfs2_trans_add_gl(struct gfs2_glock *gl);
void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 31284c77bba..110dd3515dc 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
mapping = tree->inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
- goto free_tree;
+ goto free_inode;
/* Load the header */
head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -99,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
page_cache_release(page);
return tree;
- fail_page:
+fail_page:
page_cache_release(page);
- free_tree:
+free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
iput(tree->inode);
+free_tree:
kfree(tree);
return NULL;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7028c..038ed743619 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks)
jbd_free_handle(handle);
current->journal_info = NULL;
handle = ERR_PTR(err);
+ goto out;
}
lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+out:
return handle;
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index df25ecc418a..4dcc0581999 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index,
release_metapage(*mp);
*mp = NULL;
}
- if (*mp == 0) {
+ if (!(*mp)) {
*lblock = blkno;
*mp = read_index_page(ip, blkno);
}
- if (*mp == 0) {
+ if (!(*mp)) {
jfs_err("free_index: error reading directory table");
return NULL;
}
@@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
}
ip->i_size = PSIZE;
- if ((mp = get_index_page(ip, 0)) == 0) {
+ mp = get_index_page(ip, 0);
+ if (!mp) {
jfs_err("add_index: get_metapage failed!");
xtTruncate(tid, ip, 0, COMMIT_PWMAP);
memcpy(&jfs_ip->i_dirtable, temp_table,
@@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
} else
mp = read_index_page(ip, blkno);
- if (mp == 0) {
+ if (!mp) {
jfs_err("add_index: get/read_metapage failed!");
goto clean_up;
}
@@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
dirtab_slot = find_index(ip, index, &mp, &lblock);
- if (dirtab_slot == 0)
+ if (!dirtab_slot)
return;
dirtab_slot->flag = DIR_INDEX_FREE;
@@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
dirtab_slot = find_index(ip, index, mp, lblock);
- if (dirtab_slot == 0)
+ if (!dirtab_slot)
return;
DTSaddress(dirtab_slot, bn);
@@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index,
struct dir_table_slot *slot;
slot = find_index(ip, index, &mp, &lblock);
- if (slot == 0) {
+ if (!slot) {
return -EIO;
}
@@ -592,10 +593,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
struct component_name ciKey;
struct super_block *sb = ip->i_sb;
- ciKey.name =
- (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
- GFP_NOFS);
- if (ciKey.name == 0) {
+ ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
+ if (!ciKey.name) {
rc = -ENOMEM;
goto dtSearch_Exit2;
}
@@ -957,10 +956,8 @@ static int dtSplitUp(tid_t tid,
smp = split->mp;
sp = DT_PAGE(ip, smp);
- key.name =
- (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
- GFP_NOFS);
- if (key.name == 0) {
+ key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
+ if (!key.name) {
DT_PUTPAGE(smp);
rc = -ENOMEM;
goto dtSplitUp_Exit;
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 8561c6ecece..cdac2d5bafe 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -74,7 +74,7 @@ struct idtentry {
#define DTIHDRDATALEN 11
/* compute number of slots for entry */
-#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
+#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
/*
@@ -133,7 +133,7 @@ struct dir_table_slot {
( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
/* compute number of slots for entry */
-#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 )
+#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15))
#define NDTLEAF NDTINTERNAL
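The DIV_ROUND_UP() form is the same ceiling division the open-coded expressions computed; a quick worked check, written as a comment, confirms the substitution preserves behaviour (DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d) in linux/kernel.h).

/*
 * Worked example (sketch): for a key of klen == 20,
 *   old: ((4 + 20) + (15 - 1)) / 15 == 38 / 15 == 2
 *   new: DIV_ROUND_UP(4 + 20, 15) == (24 + 14) / 15 == 2
 * so NDTINTERNAL(20) still reserves two slots.
 */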
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3870ba8b908..9bf29f77173 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -381,7 +381,7 @@ int diRead(struct inode *ip)
/* read the page of disk inode */
mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
- if (mp == 0) {
+ if (!mp) {
jfs_err("diRead: read_metapage failed");
return -EIO;
}
@@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip)
/* read the page of disk inode */
retry:
mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
- if (mp == 0)
+ if (!mp)
return -EIO;
/* get the pointer to the disk inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 15a3974cdee..325a9679b95 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -208,6 +208,17 @@ static struct lmStat {
} lmStat;
#endif
+static void write_special_inodes(struct jfs_log *log,
+ int (*writer)(struct address_space *))
+{
+ struct jfs_sb_info *sbi;
+
+ list_for_each_entry(sbi, &log->sb_list, log_list) {
+ writer(sbi->ipbmap->i_mapping);
+ writer(sbi->ipimap->i_mapping);
+ writer(sbi->direct_inode->i_mapping);
+ }
+}
/*
* NAME: lmLog()
@@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
struct lrd lrd;
int lsn;
struct logsyncblk *lp;
- struct jfs_sb_info *sbi;
unsigned long flags;
/* push dirty metapages out to disk */
if (hard_sync)
- list_for_each_entry(sbi, &log->sb_list, log_list) {
- filemap_fdatawrite(sbi->ipbmap->i_mapping);
- filemap_fdatawrite(sbi->ipimap->i_mapping);
- filemap_fdatawrite(sbi->direct_inode->i_mapping);
- }
+ write_special_inodes(log, filemap_fdatawrite);
else
- list_for_each_entry(sbi, &log->sb_list, log_list) {
- filemap_flush(sbi->ipbmap->i_mapping);
- filemap_flush(sbi->ipimap->i_mapping);
- filemap_flush(sbi->direct_inode->i_mapping);
- }
+ write_special_inodes(log, filemap_flush);
/*
* forward syncpt
@@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
{
int i;
struct tblock *target = NULL;
- struct jfs_sb_info *sbi;
/* jfs_write_inode may call us during read-only mount */
if (!log)
@@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
if (wait < 2)
return;
- list_for_each_entry(sbi, &log->sb_list, log_list) {
- filemap_fdatawrite(sbi->ipbmap->i_mapping);
- filemap_fdatawrite(sbi->ipimap->i_mapping);
- filemap_fdatawrite(sbi->direct_inode->i_mapping);
- }
+ write_special_inodes(log, filemap_fdatawrite);
/*
* If there was recent activity, we may need to wait
@@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
for (i = 0; i < 200; i++) { /* Too much? */
msleep(250);
+ write_special_inodes(log, filemap_fdatawrite);
if (list_empty(&log->cqueue) &&
list_empty(&log->synclist))
break;
@@ -2347,7 +2345,7 @@ int jfsIOWait(void *arg)
do {
spin_lock_irq(&log_redrive_lock);
- while ((bp = log_redrive_list) != 0) {
+ while ((bp = log_redrive_list)) {
log_redrive_list = bp->l_redrive_next;
bp->l_redrive_next = NULL;
spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f5cd8d38af7..d1e64f2f2fc 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -39,11 +39,11 @@ static struct {
#endif
#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
-#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag)
+#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
static inline void unlock_metapage(struct metapage *mp)
{
- clear_bit(META_locked, &mp->flag);
+ clear_bit_unlock(META_locked, &mp->flag);
wake_up(&mp->wait);
}
@@ -88,7 +88,7 @@ struct meta_anchor {
};
#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
{
if (!PagePrivate(page))
return NULL;
@@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
}
#else
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
{
return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
}
@@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp)
*/
static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
- unsigned int *len)
+ int *len)
{
int rc = 0;
int xflag;
@@ -352,25 +352,27 @@ static void metapage_write_end_io(struct bio *bio, int err)
static int metapage_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio = NULL;
- unsigned int block_offset; /* block offset of mp within page */
+ int block_offset; /* block offset of mp within page */
struct inode *inode = page->mapping->host;
- unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
- unsigned int len;
- unsigned int xlen;
+ int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
+ int len;
+ int xlen;
struct metapage *mp;
int redirty = 0;
sector_t lblock;
+ int nr_underway = 0;
sector_t pblock;
sector_t next_block = 0;
sector_t page_start;
unsigned long bio_bytes = 0;
unsigned long bio_offset = 0;
- unsigned int offset;
+ int offset;
page_start = (sector_t)page->index <<
(PAGE_CACHE_SHIFT - inode->i_blkbits);
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
@@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
if (!bio->bi_size)
goto dump_bio;
submit_bio(WRITE, bio);
+ nr_underway++;
bio = NULL;
- } else {
- set_page_writeback(page);
+ } else
inc_io(page);
- }
xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
pblock = metapage_get_blocks(inode, lblock, &xlen);
if (!pblock) {
@@ -427,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
continue;
}
set_bit(META_io, &mp->flag);
- len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage);
+ len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_bdev = inode->i_sb->s_bdev;
@@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
goto dump_bio;
submit_bio(WRITE, bio);
+ nr_underway++;
}
if (redirty)
redirty_page_for_writepage(wbc, page);
unlock_page(page);
+ if (nr_underway == 0)
+ end_page_writeback(page);
+
return 0;
add_failed:
/* We should never reach here, since we're only adding one vec */
@@ -475,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page)
{
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
- unsigned int block_offset;
- unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+ int block_offset;
+ int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
sector_t page_start; /* address of page in fs blocks */
sector_t pblock;
- unsigned int xlen;
+ int xlen;
unsigned int len;
- unsigned int offset;
+ int offset;
BUG_ON(!PageLocked(page));
page_start = (sector_t)page->index <<
@@ -530,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
{
struct metapage *mp;
int ret = 1;
- unsigned int offset;
+ int offset;
for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 644429acb8c..7b698f2ec45 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb)
*/
if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
- if (ipaimap2 == 0) {
+ if (!ipaimap2) {
jfs_err("jfs_mount: Faild to read AGGREGATE_I");
rc = -EIO;
goto errout35;
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37534a..adcf92d3b60 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
/*
* Wait for outstanding transactions to be written to log:
*/
- jfs_flush_journal(log, 2);
+ jfs_flush_journal(log, 1);
/*
* close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
*
* remove file system from log active file system list.
*/
- jfs_flush_journal(log, 2);
+ jfs_flush_journal(log, 1);
/*
* Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4e0a8493cef..f8718de3505 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1103,8 +1103,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Make sure dest inode number (if any) is what we think it is
*/
rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
- if (rc == 0) {
- if ((new_ip == 0) || (ino != new_ip->i_ino)) {
+ if (!rc) {
+ if ((!new_ip) || (ino != new_ip->i_ino)) {
rc = -ESTALE;
goto out3;
}
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 71984ee9534..7f24a0bb08c 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
*/
t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
<< L2BPERDMAP;
- t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50;
+ t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
newFSCKSize = t32 << sbi->l2nbperpage;
newFSCKAddress = newLogAddress - newFSCKSize;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 314bb4ff1ba..70a14001c98 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",umask=%03o", sbi->umask);
if (sbi->flag & JFS_NOINTEGRITY)
seq_puts(seq, ",nointegrity");
+ if (sbi->nls_tab)
+ seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
+ if (sbi->flag & JFS_ERR_CONTINUE)
+ seq_printf(seq, ",errors=continue");
+ if (sbi->flag & JFS_ERR_PANIC)
+ seq_printf(seq, ",errors=panic");
#ifdef CONFIG_QUOTA
if (sbi->flag & JFS_USRQUOTA)
diff --git a/fs/namei.c b/fs/namei.c
index 3b993db26ce..73e2e665817 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
if (S_ISLNK(inode->i_mode))
return -ELOOP;
- if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
+ if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
return -EISDIR;
/*
@@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
return -EACCES;
flag &= ~O_TRUNC;
- } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
+ } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
return -EROFS;
error = vfs_permission(nd, acc_mode);
diff --git a/fs/namespace.c b/fs/namespace.c
index 06083885b21..61bf376e29e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;
/* /sys/fs */
-decl_subsys(fs, NULL, NULL);
-EXPORT_SYMBOL_GPL(fs_subsys);
+struct kobject *fs_kobj;
+EXPORT_SYMBOL_GPL(fs_kobj);
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -1861,10 +1861,9 @@ void __init mnt_init(void)
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d\n",
__FUNCTION__, err);
- err = subsystem_register(&fs_subsys);
- if (err)
- printk(KERN_WARNING "%s: subsystem_register error: %d\n",
- __FUNCTION__, err);
+ fs_kobj = kobject_create_and_add("fs", NULL);
+ if (!fs_kobj)
+ printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
init_rootfs();
init_mount_tree();
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2d116d2298f..f917fd25858 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -388,8 +388,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
* against the length which was actually received.
+ * Note that when RPCSEC/GSS (for example) is used, the
+ * data buffer can be padded so dlen might be larger
+ * than required. It must never be smaller.
*/
- if (dlen != XDR_QUADLEN(len)*4)
+ if (dlen < XDR_QUADLEN(len)*4)
return 0;
if (args->count > max_blocksize) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 986f9b32083..b86e3658a0a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -313,8 +313,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
* against the length which was actually received.
+ * Note that when RPCSEC/GSS (for example) is used, the
+ * data buffer can be padded so dlen might be larger
+ * than required. It must never be smaller.
*/
- if (dlen != XDR_QUADLEN(len)*4)
+ if (dlen < XDR_QUADLEN(len)*4)
return 0;
rqstp->rq_vec[0].iov_base = (void*)p;
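Both write-args decoders above now accept a received buffer that is merely large enough rather than exactly sized. The arithmetic is sketched as a comment below; the XDR_QUADLEN() definition quoted is the usual one from include/linux/sunrpc/xdr.h and is stated here as an assumption.

/*
 * Sketch only: XDR_QUADLEN(l) is (((l) + 3) >> 2), the number of 4-byte
 * XDR units needed for l bytes.  A 5-byte payload thus occupies
 * XDR_QUADLEN(5) * 4 == 8 bytes on the wire; RPCSEC_GSS padding may make
 * dlen larger than 8, but a dlen smaller than 8 is always an error.
 */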
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b..4d4ce48bb42 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
ioctl.o \
journal.o \
localalloc.o \
+ locks.o \
mmap.o \
namei.o \
+ resize.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
- vote.o
+ ver.o
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 23c8cda43f1..e6df06ac640 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
mutex_lock(&data_alloc_inode->i_mutex);
- status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
+ status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
out_unlock:
brelse(data_alloc_bh);
- ocfs2_meta_unlock(data_alloc_inode, 1);
+ ocfs2_inode_unlock(data_alloc_inode, 1);
out_mutex:
mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_mutex;
@@ -5118,7 +5118,7 @@ out_journal:
ocfs2_commit_trans(osb, handle);
out_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out_mutex:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad4..bc7b4cbbe8e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
#include <asm/byteorder.h>
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
{
int err = 0;
unsigned int ext_flags;
- u64 p_blkno, past_eof;
+ u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+ u64 p_blkno, count, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+ err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
&ext_flags);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
+ if (max_blocks < count)
+ count = max_blocks;
+
/*
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = count << inode->i_blkbits;
+
if (!ocfs2_sparse_alloc(osb)) {
if (p_blkno == 0) {
err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct buffer_head *di_bh)
{
void *kaddr;
- unsigned int size;
+ loff_t size;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data(inode->i_sb)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %u",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+ "Inode %llu has with inline data has bad size: %Lu",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)size);
return -EROFS;
}
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
- ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
+ ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
ret = AOP_TRUNCATED_PAGE;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
/*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
goto out_alloc;
}
- ret = ocfs2_data_lock_with_page(inode, 0, page);
- if (ret != 0) {
- if (ret == AOP_TRUNCATED_PAGE)
- unlock = 0;
- mlog_errno(ret);
- goto out_alloc;
- }
-
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
ret = ocfs2_readpage_inline(inode, page);
else
ret = block_read_full_page(page, ocfs2_get_block);
unlock = 0;
- ocfs2_data_unlock(inode, 0);
out_alloc:
up_read(&OCFS2_I(inode)->ip_alloc_sem);
-out_meta_unlock:
- ocfs2_meta_unlock(inode, 0);
+out_inode_unlock:
+ ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
unlock_page(page);
@@ -331,6 +330,62 @@ out:
return ret;
}
+/*
+ * This is used only for read-ahead. Failures or difficult to handle
+ * situations are safe to ignore.
+ *
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
+ */
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ int ret, err = -EIO;
+ struct inode *inode = mapping->host;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ loff_t start;
+ struct page *last;
+
+ /*
+ * Use the nonblocking flag for the dlm code to avoid page
+ * lock inversion, but don't bother with retrying.
+ */
+ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+ if (ret)
+ return err;
+
+ if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ ocfs2_inode_unlock(inode, 0);
+ return err;
+ }
+
+ /*
+ * Don't bother with inline-data. There isn't anything
+ * to read-ahead in that case anyway...
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto out_unlock;
+
+ /*
+ * Check whether a remote node truncated this file - we just
+ * drop out in that case as it's not worth handling here.
+ */
+ last = list_entry(pages->prev, struct page, lru);
+ start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+ if (start >= i_size_read(inode))
+ goto out_unlock;
+
+ err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+
+out_unlock:
+ up_read(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 0);
+
+ return err;
+}
+
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
* accessed concurrently from multiple nodes.
*/
if (!INODE_JOURNAL(inode)) {
- err = ocfs2_meta_lock(inode, NULL, 0);
+ err = ocfs2_inode_lock(inode, NULL, 0);
if (err) {
if (err != -ENOENT)
mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
}
if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- /*
- * We get PR data locks even for O_DIRECT. This
- * allows concurrent O_DIRECT I/O but doesn't let
- * O_DIRECT with extending and buffered zeroing writes
- * race. If they did race then the buffered zeroing
- * could be written back after the O_DIRECT I/O. It's
- * one thing to tell people not to mix buffered and
- * O_DIRECT writes, but expecting them to understand
- * that file extension is also an implicit buffered
- * write is too much. By getting the PR we force
- * writeback of the buffered zeroing before
- * proceeding.
- */
- ret = ocfs2_data_lock(inode, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- ocfs2_data_unlock(inode, 0);
- }
-
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
inode->i_sb->s_bdev, iov, offset,
nr_segs,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io);
-out:
+
mlog_exit(ret);
return ret;
}
@@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
struct buffer_head *di_bh = NULL;
struct inode *inode = mapping->host;
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
return ret;
@@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_data_lock(inode, 1);
- if (ret) {
- mlog_errno(ret);
- goto out_fail;
- }
-
ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
- goto out_fail_data;
+ goto out_fail;
}
brelse(di_bh);
return 0;
-out_fail_data:
- ocfs2_data_unlock(inode, 1);
out_fail:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
brelse(di_bh);
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
return ret;
}
@@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
- ocfs2_data_unlock(inode, 1);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
return ret;
}
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
+ .readpages = ocfs2_readpages,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f..f136639f5b4 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- brelse(bh);
+ put_bh(bh);
}
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- brelse(bh);
+ put_bh(bh);
bhs[i] = NULL;
continue;
}
@@ -280,3 +280,64 @@ bail:
mlog_exit(status);
return status;
}
+
+/* Check whether the blkno is the super block or one of the backups. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+ sector_t blkno)
+{
+ int i;
+ u64 backup_blkno;
+
+ if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+ return;
+
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ backup_blkno = ocfs2_backup_super_blkno(sb, i);
+ if (backup_blkno == blkno)
+ return;
+ }
+
+ BUG();
+}
+
+/*
+ * Writing the super block and its backups doesn't need to collaborate with
+ * the journal, so we don't need to lock ip_io_mutex and the inode doesn't
+ * need to be passed into this function.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+ struct buffer_head *bh)
+{
+ int ret = 0;
+
+ mlog_entry_void();
+
+ BUG_ON(buffer_jbd(bh));
+ ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ lock_buffer(bh);
+ set_buffer_uptodate(bh);
+
+ /* remove from dirty list before I/O. */
+ clear_buffer_dirty(bh);
+
+ get_bh(bh); /* for end_buffer_write_sync() */
+ bh->b_end_io = end_buffer_write_sync;
+ submit_bh(WRITE, bh);
+
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh)) {
+ ret = -EIO;
+ put_bh(bh);
+ }
+
+out:
+ mlog_exit(ret);
+ return ret;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac..c2e78614c3e 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
int flags,
struct inode *inode);
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+ struct buffer_head *bh);
#define OCFS2_BH_CACHED 1
#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecd..e511339886b 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
-#define O2HB_DEFAULT_DEAD_THRESHOLD 7
+#define O2HB_DEFAULT_DEAD_THRESHOLD 31
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD 2
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
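
With the default dead threshold raised from 7 to 31, the derived write timeout above grows accordingly: assuming the usual 2000 ms O2HB_REGION_TIMEOUT_MS heartbeat interval (an assumption; the value is not shown in this hunk), O2HB_MAX_WRITE_TIMEOUT_MS goes from 2000 * (7 - 1) = 12000 ms to 2000 * (31 - 1) = 60000 ms, i.e. roughly 60 seconds of missed heartbeats before a node is considered dead.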
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df94..23c732f2752 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
.kobj = {.ktype = &mlog_ktype},
};
-int mlog_sys_init(struct kset *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_kset)
{
int i = 0;
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
mlog_attr_ptrs[i] = NULL;
kobject_set_name(&mlog_kset.kobj, "logmask");
- kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
+ mlog_kset.kobj.kset = o2cb_kset;
return kset_register(&mlog_kset);
}
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd0..a4b07730b2e 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/fs.h>
#include "ocfs2_nodemanager.h"
#include "masklog.h"
#include "sys.h"
-struct o2cb_attribute {
- struct attribute attr;
- ssize_t (*show)(char *buf);
- ssize_t (*store)(const char *buf, size_t count);
-};
-
-#define O2CB_ATTR(_name, _mode, _show, _store) \
-struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
-
-#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
-static ssize_t o2cb_interface_revision_show(char *buf)
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
-
-static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+static struct kobj_attribute attr_version =
+ __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
- &o2cb_attr_interface_revision.attr,
+ &attr_version.attr,
NULL,
};
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
- const char * buffer, size_t count);
-static struct sysfs_ops o2cb_sysfs_ops = {
- .show = o2cb_show,
- .store = o2cb_store,
+static struct attribute_group o2cb_attr_group = {
+ .attrs = o2cb_attrs,
};
-static struct kobj_type o2cb_subsys_type = {
- .default_attrs = o2cb_attrs,
- .sysfs_ops = &o2cb_sysfs_ops,
-};
-
-/* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL, NULL);
-
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
-{
- struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
- struct kset *sbs = to_kset(kobj);
-
- BUG_ON(sbs != &o2cb_subsys);
-
- if (o2cb_attr->show)
- return o2cb_attr->show(buffer);
- return -EIO;
-}
-
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
- const char * buffer, size_t count)
-{
- struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
- struct kset *sbs = to_kset(kobj);
-
- BUG_ON(sbs != &o2cb_subsys);
-
- if (o2cb_attr->store)
- return o2cb_attr->store(buffer, count);
- return -EIO;
-}
+static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
- subsystem_unregister(&o2cb_subsys);
+ kset_unregister(o2cb_kset);
}
int o2cb_sys_init(void)
{
int ret;
- o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
- ret = subsystem_register(&o2cb_subsys);
+ o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
+ if (!o2cb_kset)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
- return ret;
+ goto error;
- ret = mlog_sys_init(&o2cb_subsys);
+ ret = mlog_sys_init(o2cb_kset);
if (ret)
- subsystem_unregister(&o2cb_subsys);
+ goto error;
+ return 0;
+error:
+ kset_unregister(o2cb_kset);
return ret;
}
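
Since the kset is now created with fs_kobj as its parent, the attribute registered above should appear as /sys/fs/o2cb/interface_revision after this change, whereas the old decl_subsys() variant lived at the top level (/sys/o2cb). This location is inferred from the kset_create_and_add() call, not stated explicitly in the patch.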
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f..f36f66aab3d 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
/* same as hb delay, we're waiting for another node to recognize our hb */
#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
-#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000
-#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
/* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89..b2e832aca56 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * New in version 10:
+ * - Meta/data locks combined
+ *
+ * New in version 9:
+ * - All votes removed
+ *
* New in version 8:
* - Replace delete inode votes with a cluster lock
*
@@ -60,7 +66,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 8ULL
+#define O2NET_PROTOCOL_VERSION 10ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30..a56eee6abad 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
#include "ver.h"
-#define CLUSTER_BUILD_VERSION "1.3.3"
+#define CLUSTER_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d..b1cc7c381e8 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
/*
* Walk the inode alias list, and find a dentry which has a given
* parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
- * is looking for a dentry_lock reference. The vote thread is looking
- * to unhash aliases, so we allow it to skip any that already have
- * that property.
+ * is looking for a dentry_lock reference. The downconvert thread is
+ * looking to unhash aliases, so we allow it to skip any that already
+ * have that property.
*/
struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
dl->dl_count = 0;
/*
* Does this have to happen below, for all attaches, in case
- * the struct inode gets blown away by votes?
+ * the struct inode gets blown away by the downconvert thread?
*/
dl->dl_inode = igrab(inode);
dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4..6b0107f2134 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
mlog_entry("dirino=%llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
* on commonly accessed directories. */
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
lock_level = 0;
- error = ocfs2_meta_lock(inode, NULL, 0);
+ error = ocfs2_inode_lock(inode, NULL, 0);
}
if (error < 0) {
if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
dirent, filldir, NULL);
- ocfs2_meta_unlock(inode, lock_level);
+ ocfs2_inode_unlock(inode, lock_level);
bail_nolock:
mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f..a733b3321f8 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
#include "dlmfsver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf9143..91f747b8a53 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
}
}
+ /* Clean up join state on node death. */
+ if (dlm->joining_node == idx) {
+ mlog(0, "Clearing join state for node %u\n", idx);
+ __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+ }
+
/* check to see if the node is already considered dead */
if (!test_bit(idx, dlm->live_nodes_map)) {
mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
clear_bit(idx, dlm->live_nodes_map);
- /* Clean up join state on node death. */
- if (dlm->joining_node == idx) {
- mlog(0, "Clearing join state for node %u\n", idx);
- __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- }
-
/* make sure local cleanup occurs before the heartbeat events */
if (!test_bit(idx, dlm->recovery_map))
dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
if (!dlm_grab(dlm))
return;
+ /*
+ * This will notify any dlm users that a node in our domain
+ * went away without notifying us first.
+ */
+ if (test_bit(idx, dlm->domain_map))
+ dlm_fire_domain_eviction_callbacks(dlm, idx);
+
spin_lock(&dlm->spinlock);
__dlm_hb_node_down(dlm, idx);
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f4..dfc0da4d158 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
#include "dlmver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8..3867244fb14 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
-#include "vote.h"
#include "buffer_head_io.h"
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
/*
* Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
/*
- * Optionally called in the downconvert (or "vote") thread
- * after a successful downconvert. The lockres will not be
- * referenced after this callback is called, so it is safe to
- * free memory, etc.
+ * Optionally called in the downconvert thread after a
+ * successful downconvert. The lockres will not be referenced
+ * after this callback is called, so it is safe to free
+ * memory, etc.
*
* The exact semantics of when this is called are controlled
* by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.get_osb = ocfs2_get_inode_osb,
.check_downconvert = ocfs2_check_meta_downconvert,
.set_lvb = ocfs2_set_meta_lvb,
- .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
-};
-
-static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
- .get_osb = ocfs2_get_inode_osb,
.downconvert_worker = ocfs2_data_convert_worker,
- .flags = 0,
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+ .get_osb = ocfs2_get_file_osb,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
- lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
lockres->l_type == OCFS2_LOCK_TYPE_RW ||
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
"resource %s: %s\n", dlm_errname(_stat), _func, \
_lockres->l_name, dlm_errmsg(_stat)); \
} while (0)
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres);
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_downconvert_thread(void *arg);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int new_level,
+ int lvb);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
ops = &ocfs2_inode_rw_lops;
break;
case OCFS2_LOCK_TYPE_META:
- ops = &ocfs2_inode_meta_lops;
- break;
- case OCFS2_LOCK_TYPE_DATA:
- ops = &ocfs2_inode_data_lops;
+ ops = &ocfs2_inode_inode_lops;
break;
case OCFS2_LOCK_TYPE_OPEN:
ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
return OCFS2_SB(inode->i_sb);
}
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_file_private *fp = lockres->l_priv;
+
+ return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
+
static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
{
__be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_rename_lops, osb);
}
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_file_private *fp)
+{
+ struct inode *inode = fp->fp_file->f_mapping->host;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+ inode->i_generation, lockres->l_name);
+ ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+ OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+ fp);
+ lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
+
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
lockres->l_name, level, lockres->l_level,
ocfs2_lock_type_string(lockres->l_type));
+ /*
+ * We can skip the bast for locks which don't enable caching -
+ * they'll be dropped at the earliest possible time anyway.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+ return;
+
spin_lock_irqsave(&lockres->l_lock, flags);
needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
wake_up(&lockres->l_event);
- ocfs2_kick_vote_thread(osb);
+ ocfs2_wake_downconvert_thread(osb);
}
static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
}
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+ struct ocfs2_lock_res *lockres)
+{
+ int ret;
+
+ ret = wait_for_completion_interruptible(&mw->mw_complete);
+ if (ret)
+ lockres_remove_mask_waiter(lockres, mw);
+ else
+ ret = mw->mw_status;
+ /* Re-arm the completion in case we want to wait on it again */
+ INIT_COMPLETION(mw->mw_complete);
+ return ret;
+}
+
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
ocfs2_dec_holders(lockres, level);
- ocfs2_vote_on_unlock(osb, lockres);
+ ocfs2_downconvert_on_unlock(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
mlog_exit_void();
}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
* We don't want to use LKM_LOCAL on a meta data lock as they
* don't use a generation in their lock names.
*/
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
if (ret) {
mlog_errno(ret);
goto bail;
@@ -1311,76 +1357,221 @@ out:
mlog_exit_void();
}
-int ocfs2_data_lock_full(struct inode *inode,
- int write,
- int arg_flags)
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
+ int level)
{
- int status = 0, level;
- struct ocfs2_lock_res *lockres;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
+ struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+ unsigned long flags;
+ struct ocfs2_mask_waiter mw;
- BUG_ON(!inode);
+ ocfs2_init_mask_waiter(&mw);
- mlog_entry_void();
+retry_cancel:
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+ ret = ocfs2_prepare_cancel_convert(osb, lockres);
+ if (ret) {
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ ret = ocfs2_cancel_convert(osb, lockres);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ goto retry_cancel;
+ }
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
- mlog(0, "inode %llu take %s DATA lock\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- write ? "EXMODE" : "PRMODE");
+ ocfs2_wait_for_mask(&mw);
+ goto retry_cancel;
+ }
- /* We'll allow faking a readonly data lock for
- * rodevices. */
- if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
- if (write) {
- status = -EROFS;
- mlog_errno(status);
+ ret = -ERESTARTSYS;
+ /*
+ * We may still have gotten the lock, in which case there's no
+ * point in restarting the syscall.
+ */
+ if (lockres->l_level == level)
+ ret = 0;
+
+ mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+ lockres->l_flags, lockres->l_level, lockres->l_action);
+
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take exactly
+ * what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ * no-lock at unlock time. This also means flock locks never go on
+ * the blocking list.
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ * sure to allow cancellation of a misbehaving applications flock()
+ * request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ * can simplify the code by requiring the caller to guarantee
+ * serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+ int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
+ unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+ unsigned long flags;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+ struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+ struct ocfs2_mask_waiter mw;
+
+ ocfs2_init_mask_waiter(&mw);
+
+ if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+ (lockres->l_level > LKM_NLMODE)) {
+ mlog(ML_ERROR,
+ "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+ "level: %u\n", lockres->l_name, lockres->l_flags,
+ lockres->l_level);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ /*
+ * Get the lock at NLMODE to start - that way we
+ * can cancel the upconvert request if need be.
+ */
+ ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
- goto out;
+
+ ret = ocfs2_wait_for_mask(&mw);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ spin_lock_irqsave(&lockres->l_lock, flags);
}
- if (ocfs2_mount_local(osb))
- goto out;
+ lockres->l_action = OCFS2_AST_CONVERT;
+ lkm_flags |= LKM_CONVERT;
+ lockres->l_requested = level;
+ lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
- lockres = &OCFS2_I(inode)->ip_data_lockres;
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
- level = write ? LKM_EXMODE : LKM_PRMODE;
+ ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+ lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+ ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+ if (ret != DLM_NORMAL) {
+ if (trylock && ret == DLM_NOTQUEUED)
+ ret = -EAGAIN;
+ else {
+ ocfs2_log_dlm_error("dlmlock", ret, lockres);
+ ret = -EINVAL;
+ }
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
- 0, arg_flags);
- if (status < 0 && status != -EAGAIN)
- mlog_errno(status);
+ ocfs2_recover_from_dlm_error(lockres, 1);
+ lockres_remove_mask_waiter(lockres, &mw);
+ goto out;
+ }
+
+ ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+ if (ret == -ERESTARTSYS) {
+ /*
+ * Userspace can cause deadlock itself with
+ * flock(). Current behavior locally is to allow the
+ * deadlock, but abort the system call if a signal is
+ * received. We follow this example, otherwise a
+ * poorly written program could sit in the kernel until
+ * reboot.
+ *
+ * Handling this is a bit more complicated for Ocfs2
+ * though. We can't exit this function with an
+ * outstanding lock request, so a cancel convert is
+ * required. We intentionally overwrite 'ret' - if the
+ * cancel fails and the lock was granted, it's easier
+ * to just bubble success back up to the user.
+ */
+ ret = ocfs2_flock_handle_signal(lockres, level);
+ }
out:
- mlog_exit(status);
- return status;
+
+ mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+ lockres->l_name, ex, trylock, ret);
+ return ret;
}
-/* see ocfs2_meta_lock_with_page() */
-int ocfs2_data_lock_with_page(struct inode *inode,
- int write,
- struct page *page)
+void ocfs2_file_unlock(struct file *file)
{
int ret;
+ unsigned long flags;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+ struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+ struct ocfs2_mask_waiter mw;
- ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
- if (ret == -EAGAIN) {
- unlock_page(page);
- if (ocfs2_data_lock(inode, write) == 0)
- ocfs2_data_unlock(inode, write);
- ret = AOP_TRUNCATED_PAGE;
+ ocfs2_init_mask_waiter(&mw);
+
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
+ return;
+
+ if (lockres->l_level == LKM_NLMODE)
+ return;
+
+ mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+ lockres->l_name, lockres->l_flags, lockres->l_level,
+ lockres->l_action);
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ /*
+ * Fake a blocking ast for the downconvert code.
+ */
+ lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+ lockres->l_blocking = LKM_EXMODE;
+
+ ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+ if (ret) {
+ mlog_errno(ret);
+ return;
}
- return ret;
+ ret = ocfs2_wait_for_mask(&mw);
+ if (ret)
+ mlog_errno(ret);
}
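
The comment above requires callers to serialize dlmglue flock calls themselves; a rough sketch of what such a caller could look like follows. The real ->flock handler lives in the new fs/ocfs2/locks.c and is not part of this hunk, so the function name and body here are illustrative assumptions only.

/* Hypothetical ->flock-style caller; uses the per-open fp_mutex to
 * satisfy dlmglue's "caller guarantees serialization" rule. */
static int example_flock(struct file *file, int ex, int trylock)
{
	int ret;
	struct ocfs2_file_private *fp = file->private_data;

	mutex_lock(&fp->fp_mutex);
	ret = ocfs2_file_lock(file, ex, trylock);
	mutex_unlock(&fp->fp_mutex);

	return ret;
}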
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres)
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int kick = 0;
mlog_entry_void();
/* If we know that another node is waiting on our lock, kick
- * the vote thread * pre-emptively when we reach a release
+ * the downconvert thread pre-emptively when we reach a release
* condition. */
if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
}
if (kick)
- ocfs2_kick_vote_thread(osb);
-
- mlog_exit_void();
-}
-
-void ocfs2_data_unlock(struct inode *inode,
- int write)
-{
- int level = write ? LKM_EXMODE : LKM_PRMODE;
- struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
- mlog_entry_void();
-
- mlog(0, "inode %llu drop %s DATA lock\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- write ? "EXMODE" : "PRMODE");
-
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
- !ocfs2_mount_local(osb))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ ocfs2_wake_downconvert_thread(osb);
mlog_exit_void();
}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
/* Call this with the lockres locked. I am reasonably sure we don't
* need ip_lock in this function as anyone who would be changing those
- * values is supposed to be blocked in ocfs2_meta_lock right now. */
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
}
/* may or may not return a bh if it went to disk. */
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh)
{
int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
* returns < 0 error if the callback will never be called, otherwise
* the result of the lock will be communicated via the callback.
*/
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map));
- lockres = &OCFS2_I(inode)->ip_meta_lockres;
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
level = ex ? LKM_EXMODE : LKM_PRMODE;
dlm_flags = 0;
if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
}
/* This is fun. The caller may want a bh back, or it may
- * not. ocfs2_meta_lock_update definitely wants one in, but
+ * not. ocfs2_inode_lock_update definitely wants one in, but
* may or may not read one, depending on what's in the
* LVB. The result of all of this is that we've *only* gone to
* disk if we have to, so the complexity is worthwhile. */
- status = ocfs2_meta_lock_update(inode, &local_bh);
+ status = ocfs2_inode_lock_update(inode, &local_bh);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
*ret_bh = NULL;
}
if (acquired)
- ocfs2_meta_unlock(inode, ex);
+ ocfs2_inode_unlock(inode, ex);
}
if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
}
/*
- * This is working around a lock inversion between tasks acquiring DLM locks
- * while holding a page lock and the vote thread which blocks dlm lock acquiry
- * while acquiring page locks.
+ * This is working around a lock inversion between tasks acquiring DLM
+ * locks while holding a page lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring page locks.
*
 * ** These _with_page variants are only intended to be called from aop
* methods that hold page locks and return a very specific *positive* error
* code that aop methods pass up to the VFS -- test for errors with != 0. **
*
- * The DLM is called such that it returns -EAGAIN if it would have blocked
- * waiting for the vote thread. In that case we unlock our page so the vote
- * thread can make progress. Once we've done this we have to return
- * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
- * into the VFS who will then immediately retry the aop call.
+ * The DLM is called such that it returns -EAGAIN if it would have
+ * blocked waiting for the downconvert thread. In that case we unlock
+ * our page so the downconvert thread can make progress. Once we've
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
*
* We do a blocking lock and immediate unlock before returning, though, so that
* the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
* ping locks back and forth, but that's a risk we're willing to take to avoid
* the lock inversion simply.
*/
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page)
{
int ret;
- ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
+ ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
- ocfs2_meta_unlock(inode, ex);
+ if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+ ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
return ret;
}
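
As a usage illustration of the _with_page convention described above, the sketch below shows a schematic ->readpage-style caller; the real consumer is in aops.c and differs in detail, so treat the names and the elided page handling as assumptions.

/* Schematic aop caller: a nonzero return may be a negative errno or
 * AOP_TRUNCATED_PAGE, which tells the VFS to retry the operation.
 * Page locking/unlocking on the success path is elided for brevity. */
static int example_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *di_bh = NULL;
	int ret;

	ret = ocfs2_inode_lock_with_page(inode, &di_bh, 0, page);
	if (ret != 0)
		return ret;

	/* ... fill the page under the cluster lock ... */

	brelse(di_bh);
	ocfs2_inode_unlock(inode, 0);
	return 0;
}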
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level)
{
int ret;
mlog_entry_void();
- ret = ocfs2_meta_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
if (ocfs2_should_update_atime(inode, vfsmnt)) {
struct buffer_head *bh = NULL;
- ocfs2_meta_unlock(inode, 0);
- ret = ocfs2_meta_lock(inode, &bh, 1);
+ ocfs2_inode_unlock(inode, 0);
+ ret = ocfs2_inode_lock(inode, &bh, 1);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
return ret;
}
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
int ex)
{
int level = ex ? LKM_EXMODE : LKM_PRMODE;
- struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- /* launch vote thread */
- osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
- if (IS_ERR(osb->vote_task)) {
- status = PTR_ERR(osb->vote_task);
- osb->vote_task = NULL;
+ /* launch downconvert thread */
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ if (IS_ERR(osb->dc_task)) {
+ status = PTR_ERR(osb->dc_task);
+ osb->dc_task = NULL;
mlog_errno(status);
goto bail;
}
@@ -2353,8 +2525,8 @@ local:
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
- if (osb->vote_task)
- kthread_stop(osb->vote_task);
+ if (osb->dc_task)
+ kthread_stop(osb->dc_task);
}
mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
ocfs2_drop_osb_locks(osb);
- if (osb->vote_task) {
- kthread_stop(osb->vote_task);
- osb->vote_task = NULL;
+ if (osb->dc_task) {
+ kthread_stop(osb->dc_task);
+ osb->dc_task = NULL;
}
ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
- * being dequeued from the vote thread before we can consider
+ * being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
- if (err < 0)
- mlog_errno(err);
- if (err < 0 && !status)
- status = err;
-
- err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_meta_lockres);
+ &OCFS2_I(inode)->ip_inode_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISREG(inode->i_mode))
+ goto out;
+
/*
* We need this before the filemap_fdatawrite() so that it can
* transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out:
return UNBLOCK_CONTINUE;
}
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
/*
* Does the final reference drop on our dentry lock. Right now this
- * happens in the vote thread, but we could choose to simplify the
+ * happens in the downconvert thread, but we could choose to simplify the
* dlmglue API and push these off to the ocfs2_wq in the future.
*/
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
mlog(0, "lockres %s blocked.\n", lockres->l_name);
/* Detect whether a lock has been marked as going away while
- * the vote thread was processing other things. A lock can
+ * the downconvert thread was processing other things. A lock can
* still be marked with OCFS2_LOCK_FREEING after this check,
* but short circuiting here will still save us some
* performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
- spin_lock(&osb->vote_task_lock);
+ spin_lock(&osb->dc_task_lock);
if (list_empty(&lockres->l_blocked_list)) {
list_add_tail(&lockres->l_blocked_list,
&osb->blocked_lock_list);
osb->blocked_lock_count++;
}
- spin_unlock(&osb->vote_task_lock);
+ spin_unlock(&osb->dc_task_lock);
+
+ mlog_exit_void();
+}
+
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+ unsigned long processed;
+ struct ocfs2_lock_res *lockres;
+
+ mlog_entry_void();
+
+ spin_lock(&osb->dc_task_lock);
+ /* grab this early so we know to try again if a state change and
+ * wake happens part-way through our work */
+ osb->dc_work_sequence = osb->dc_wake_sequence;
+
+ processed = osb->blocked_lock_count;
+ while (processed) {
+ BUG_ON(list_empty(&osb->blocked_lock_list));
+
+ lockres = list_entry(osb->blocked_lock_list.next,
+ struct ocfs2_lock_res, l_blocked_list);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock(&osb->dc_task_lock);
+
+ BUG_ON(!processed);
+ processed--;
+
+ ocfs2_process_blocked_lock(osb, lockres);
+
+ spin_lock(&osb->dc_task_lock);
+ }
+ spin_unlock(&osb->dc_task_lock);
mlog_exit_void();
}
+
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
+{
+ int empty = 0;
+
+ spin_lock(&osb->dc_task_lock);
+ if (list_empty(&osb->blocked_lock_list))
+ empty = 1;
+
+ spin_unlock(&osb->dc_task_lock);
+ return empty;
+}
+
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+ int should_wake = 0;
+
+ spin_lock(&osb->dc_task_lock);
+ if (osb->dc_work_sequence != osb->dc_wake_sequence)
+ should_wake = 1;
+ spin_unlock(&osb->dc_task_lock);
+
+ return should_wake;
+}
+
+int ocfs2_downconvert_thread(void *arg)
+{
+ int status = 0;
+ struct ocfs2_super *osb = arg;
+
+ /* only quit once we've been asked to stop and there is no more
+ * work available */
+ while (!(kthread_should_stop() &&
+ ocfs2_downconvert_thread_lists_empty(osb))) {
+
+ wait_event_interruptible(osb->dc_event,
+ ocfs2_downconvert_thread_should_wake(osb) ||
+ kthread_should_stop());
+
+ mlog(0, "downconvert_thread: awoken\n");
+
+ ocfs2_downconvert_thread_do_work(osb);
+ }
+
+ osb->dc_task = NULL;
+ return status;
+}
+
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->dc_task_lock);
+ /* make sure the downconvert thread gets a swipe at whatever changes
+ * the caller may have made to the lock state */
+ osb->dc_wake_sequence++;
+ spin_unlock(&osb->dc_task_lock);
+ wake_up(&osb->dc_event);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e4120..5f17243ba50 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
-/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
/* Instruct the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE (0x02)
-/* don't block waiting for the vote thread, instead return -EAGAIN */
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
struct inode *inode);
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_file_private *fp);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock_full(struct inode *inode,
- int write,
- int arg_flags);
-#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
-int ocfs2_data_lock_with_page(struct inode *inode,
- int write,
- struct page *page);
-void ocfs2_data_unlock(struct inode *inode,
- int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
int arg_flags);
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page);
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
-#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0)
-void ocfs2_meta_unlock(struct inode *inode,
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
+void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
-/* for the vote thread */
+/* for the downconvert thread */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af1..1942e09f6ee 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
*var = cpu_to_le64(le64_to_cpu(*var) + val);
}
-static inline void le32_and_cpu(__le32 *var, u32 val)
-{
- *var = cpu_to_le32(le32_to_cpu(*var) & val);
-}
-
static inline void be32_add_cpu(__be32 *var, u32 val)
{
*var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a..67527cebf21 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
}
- inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
+ inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
if (IS_ERR(inode))
return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
mlog(0, "find parent of directory %llu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno);
- status = ocfs2_meta_lock(dir, NULL, 0);
+ status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail_unlock;
}
- inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+ inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
if (IS_ERR(inode)) {
mlog(ML_ERROR, "Unable to create inode %llu\n",
(unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
parent->d_op = &ocfs2_dentry_ops;
bail_unlock:
- ocfs2_meta_unlock(dir, 0);
+ ocfs2_inode_unlock(dir, 0);
bail:
mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e4..ed5d5232e85 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
+#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
return sync_mapping_buffers(inode->i_mapping);
}
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+ struct ocfs2_file_private *fp;
+
+ fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->fp_file = file;
+ mutex_init(&fp->fp_mutex);
+ ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+ file->private_data = fp;
+
+ return 0;
+}
+
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (fp) {
+ ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+ ocfs2_lock_res_free(&fp->fp_flock);
+ kfree(fp);
+ file->private_data = NULL;
+ }
+}
+
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
oi->ip_open_count++;
spin_unlock(&oi->ip_lock);
- status = 0;
+
+ status = ocfs2_init_file_private(inode, file);
+ if (status) {
+ /*
+ * We want to set open count back if we're failing the
+ * open.
+ */
+ spin_lock(&oi->ip_lock);
+ oi->ip_open_count--;
+ spin_unlock(&oi->ip_lock);
+ }
+
leave:
mlog_exit(status);
return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
spin_unlock(&oi->ip_lock);
+ ocfs2_free_file_private(inode, file);
+
mlog_exit(0);
return 0;
}
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+ return ocfs2_init_file_private(inode, file);
+}
+
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+ ocfs2_free_file_private(inode, file);
+ return 0;
+}
+
static int ocfs2_sync_file(struct file *file,
struct dentry *dentry,
int datasync)
@@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- /* This forces other nodes to sync and drop their pages. Do
- * this even if we have a truncate without allocation change -
- * ocfs2 cluster sizes can be much greater than page size, so
- * we have to truncate them anyway. */
- status = ocfs2_data_lock(inode, 1);
- if (status < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- mlog_errno(status);
- goto bail;
- }
-
+ /*
+ * The inode lock forced other nodes to sync and drop their
+ * pages, which (correctly) happens even if we have a truncate
+ * without allocation change - ocfs2 cluster sizes can be much
+ * greater than page size, so we have to truncate them
+ * anyway.
+ */
unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(inode->i_mapping, new_i_size);
@@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
if (status)
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
/* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
/* TODO: orphan dir cleanup here. */
-bail_unlock_data:
- ocfs2_data_unlock(inode, 1);
-
+bail_unlock_sem:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
le32_to_cpu(fe->i_clusters),
(unsigned long long)le64_to_cpu(fe->i_size));
mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
- OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+ OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
leave:
if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
{
- int ret = 0, data_locked = 0;
+ int ret = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
&& ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
goto out_update_size;
- /*
- * protect the pages that ocfs2_zero_extend is going to be
- * pulling into the page cache.. we do this before the
- * metadata extend so that we don't get into the situation
- * where we've extended the metadata but can't get the data
- * lock to zero.
- */
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- data_locked = 1;
-
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
up_write(&oi->ip_alloc_sem);
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
}
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
if (ret < 0) {
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
if (ret < 0)
mlog_errno(ret);
-out_unlock:
- if (data_locked)
- ocfs2_data_unlock(inode, 1);
-
out:
return ret;
}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- status = ocfs2_meta_lock(inode, &bh, 1);
+ status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
mlog_entry_void();
- ret = ocfs2_meta_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret) {
if (ret != -ENOENT)
mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
ret = generic_permission(inode, mask, NULL);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
@@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out;
}
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_rw_unlock;
@@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
ret = -EPERM;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
switch (sr->l_whence) {
@@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
break;
default:
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
sr->l_whence = 0;
@@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
|| (sr->l_start + llen) < 0
|| (sr->l_start + llen) > max_off) {
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
size = sr->l_start + sr->l_len;
if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
}
@@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
}
@@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
/*
@@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
if (change_size && i_size_read(inode) < size)
@@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ocfs2_commit_trans(osb, handle);
-out_meta_unlock:
+out_inode_unlock:
brelse(di_bh);
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
@@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* if we need to make modifications here.
*/
for(;;) {
- ret = ocfs2_meta_lock(inode, NULL, meta_level);
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
if (ret < 0) {
meta_level = -1;
mlog_errno(ret);
@@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_meta_unlock(inode, meta_level);
+ ocfs2_inode_unlock(inode, meta_level);
meta_level = 1;
continue;
}
@@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
*ppos = saved_pos;
out_unlock:
- ocfs2_meta_unlock(inode, meta_level);
+ ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
@@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
/*
* See the comment in ocfs2_file_aio_read()
*/
- ret = ocfs2_meta_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
ret = generic_file_splice_read(in, ppos, pipe, len, flags);
@@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_meta_unlock(inode, lock_level);
+ ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
if (ret == -EINVAL)
@@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
};
const struct file_operations ocfs2_fops = {
+ .llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
@@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
};
const struct file_operations ocfs2_dops = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ocfs2_readdir,
.fsync = ocfs2_sync_file,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
.ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .flock = ocfs2_flock,
};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a..048ddcaf5c8 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
+struct ocfs2_file_private {
+ struct file *fp_file;
+ struct mutex fp_mutex;
+ struct ocfs2_lock_res fp_flock;
+};
+
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240..c0efd9489fe 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
#include <linux/highmem.h>
#include <linux/kmod.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-
#include <dlm/dlmapi.h>
#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
-#include "vote.h"
#include "buffer_head_io.h"
-#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
-#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
-
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
- ocfs2_node_map_init(&osb->mounted_map);
ocfs2_node_map_init(&osb->recovery_map);
- ocfs2_node_map_init(&osb->umount_map);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
return;
}
- if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
- /* If a node is in the umount map, then we've been
- * expecting him to go down and we know ahead of time
- * that recovery is not necessary. */
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
- return;
- }
-
ocfs2_recovery_thread(osb, node_num);
-
- ocfs2_remove_node_from_vote_queues(osb, node_num);
-}
-
-static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
- int node_num,
- void *data)
-{
- ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
}
/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
ocfs2_do_node_down(node_num, osb);
}
-static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
- int node_num,
- void *data)
-{
- struct ocfs2_super *osb = data;
-
- BUG_ON(osb->node_num == node_num);
-
- mlog(0, "node up event for %d\n", node_num);
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
{
- o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
- ocfs2_hb_node_down_cb, osb,
- OCFS2_HB_NODE_DOWN_PRI);
-
- o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
- ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
-
/* Not exactly a heartbeat callback, but leads to essentially
* the same path so we set it up here. */
dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
osb);
}
-/* Most functions here are just stubs for now... */
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
-{
- int status;
-
- if (ocfs2_mount_local(osb))
- return 0;
-
- status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
- if (status < 0) {
- mlog_errno(status);
- o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
- }
-
-bail:
- return status;
-}
-
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
-{
- if (ocfs2_mount_local(osb))
- return;
-
- o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
- o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
-}
-
void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
{
int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
-
if (!test_bit(num, osb->recovery_map.map)) {
__ocfs2_node_map_set_bit(&osb->recovery_map, num);
set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e..56859211888 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
void ocfs2_init_node_maps(struct ocfs2_super *osb);
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f3..7e9e4c79aec 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "vote.h"
#include "buffer_head_io.h"
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
u64 fi_blkno;
unsigned long fi_ino;
unsigned int fi_flags;
+ unsigned int fi_sysfile_type;
};
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
+
static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_find_inode_args *args);
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
oi->ip_attr |= OCFS2_DIRSYNC_FL;
}
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+ int sysfile_type)
{
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
args.fi_blkno = blkno;
args.fi_flags = flags;
args.fi_ino = ino_from_blkno(sb, blkno);
+ args.fi_sysfile_type = sysfile_type;
inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+ if (args->fi_sysfile_type != 0)
+ lockdep_set_class(&inode->i_mutex,
+ &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
mlog_exit(0);
return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
*/
BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_LOCK_TYPE_RW, inode->i_generation,
inode);
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
- OCFS2_LOCK_TYPE_DATA, inode->i_generation,
- inode);
-
ocfs2_set_inode_flags(inode);
status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
generation = osb->fs_generation;
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META,
generation, inode);
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
mlog_errno(status);
return status;
}
- status = ocfs2_meta_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
bail:
if (can_lock)
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
if (status < 0)
make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
mutex_lock(&inode_alloc_inode->i_mutex);
- status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1);
+ status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+ di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
status = ocfs2_journal_dirty(handle, di_bh);
if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode_alloc_inode, 1);
+ ocfs2_inode_unlock(inode_alloc_inode, 1);
mutex_unlock(&inode_alloc_inode->i_mutex);
brelse(inode_alloc_bh);
bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
* delete_inode operation. We do this now to avoid races with
* recovery completion on other nodes. */
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mutex_unlock(&orphan_dir_inode->i_mutex);
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
}
/* we do this while holding the orphan dir lock because we
- * don't want recovery being run from another node to vote for
- * an inode delete on us -- this will result in two nodes
+ * don't want recovery being run from another node to try an
+ * inode delete underneath us -- this will result in two nodes
* truncating the same file! */
status = ocfs2_truncate_for_delete(osb, inode, di_bh);
if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
mlog_errno(status);
bail_unlock_dir:
- ocfs2_meta_unlock(orphan_dir_inode, 1);
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
brelse(orphan_dir_bh);
bail:
@@ -744,7 +747,7 @@ bail:
}
/* There is a series of simple checks that should be done before a
- * vote is even considered. Encapsulate those in this function. */
+ * trylock is even considered. Encapsulate those in this function. */
static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
{
int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from process_vote we can't go into our own
+ /* If we're coming from downconvert_thread we can't go into our own
* voting [hello, deadlock city!], so unforuntately we just
* have to skip deleting this guy. That's OK though because
* the node who's doing the actual deleting should handle it
* anyway. */
- if (current == osb->vote_task) {
+ if (current == osb->dc_task) {
mlog(0, "Skipping delete of %lu because we're currently "
- "in process_vote\n", inode->i_ino);
+ "in downconvert\n", inode->i_ino);
goto bail;
}
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have voted "yes" on the wipe of this inode for
- * another node, it will be marked here so we can safely skip
- * it. Recovery will cleanup any inodes we might inadvertantly
- * skip here. */
+ /* If we have allowed wipe of this inode for another node, it
+ * will be marked here so we can safely skip it. Recovery will
+ * clean up any inodes we might inadvertently skip here. */
if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
mlog(0, "Skipping delete of %lu because another node "
"has done this for us.\n", inode->i_ino);
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
- * serialize delete_inode votes.
+ * serialize delete_inode on multiple nodes.
*
* Even though we might be doing a truncate, we don't take the
* allocation lock here as it won't be needed - nobody will
* have the file open.
*/
- status = ocfs2_meta_lock(inode, &di_bh, 1);
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
if (!wipe || status < 0) {
- /* Error and inode busy vote both mean we won't be
+ /* Error and remote inode busy both mean we won't be
* removing the inode, so they take almost the same
* path. */
if (status < 0)
mlog_errno(status);
- /* Someone in the cluster has voted to not wipe this
- * inode, or it was never completely orphaned. Write
- * out the pages and exit now. */
+ /* Someone in the cluster has disallowed a wipe of
+ * this inode, or it was never completely
+ * orphaned. Write out the pages and exit now. */
ocfs2_cleanup_delete_inode(inode, 1);
goto bail_unlock_inode;
}
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
bail_unlock_inode:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
bail_unblock:
status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
- /* For remove delete_inode vote, we hold open lock before,
- * now it is time to unlock PR and EX open locks. */
+ /* To prevent remote deletes we held the open lock earlier; now it
+ * is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce
- * the vote thread while waiting to destroy the locks. */
+ * the downconvert thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_errno(status);
ocfs2_lock_res_free(&oi->ip_rw_lockres);
- ocfs2_lock_res_free(&oi->ip_meta_lockres);
- ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_inode_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
- /* Let ocfs2_meta_lock do the work of updating our struct
+ /* Let ocfs2_inode_lock do the work of updating our struct
* inode for us. */
- status = ocfs2_meta_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
bail:
mlog_exit(status);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c5553..390a85596aa 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
u64 ip_blkno;
struct ocfs2_lock_res ip_rw_lockres;
- struct ocfs2_lock_res ip_meta_lockres;
- struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_inode_lockres;
struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
+#define OCFS2_FI_FLAG_SYSFILE 0x1
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+ int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b..5177fba5162 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
#include "ocfs2_fs.h"
#include "ioctl.h"
+#include "resize.h"
#include <linux/ext2_fs.h>
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
- status = ocfs2_meta_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
return status;
}
ocfs2_get_inode_flags(OCFS2_I(inode));
*flags = OCFS2_I(inode)->ip_attr;
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
mlog_exit(status);
return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
mutex_lock(&inode->i_mutex);
- status = ocfs2_meta_lock(inode, &bh, 1);
+ status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail:
mutex_unlock(&inode->i_mutex);
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
unsigned int cmd, unsigned long arg)
{
unsigned int flags;
+ int new_clusters;
int status;
struct ocfs2_space_resv sr;
+ struct ocfs2_new_group_input input;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
return -EFAULT;
return ocfs2_change_file_space(filp, cmd, &sr);
+ case OCFS2_IOC_GROUP_EXTEND:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (get_user(new_clusters, (int __user *)arg))
+ return -EFAULT;
+
+ return ocfs2_group_extend(inode, new_clusters);
+ case OCFS2_IOC_GROUP_ADD:
+ case OCFS2_IOC_GROUP_ADD64:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ return ocfs2_group_add(inode, &input);
default:
return -ENOTTY;
}
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
case OCFS2_IOC_UNRESVSP64:
+ case OCFS2_IOC_GROUP_EXTEND:
+ case OCFS2_IOC_GROUP_ADD:
+ case OCFS2_IOC_GROUP_ADD64:
break;
default:
return -ENOIOCTLCMD;
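For context on the two new ioctls: they are meant to be driven from userspace (normally tunefs.ocfs2 during an online resize), and the caller must hold CAP_SYS_RESOURCE. A minimal, hypothetical user-space call for OCFS2_IOC_GROUP_EXTEND could look like the sketch below; the ioctl number is re-declared locally only for illustration and mirrors the definition this patch adds to ocfs2_fs.h, and any open descriptor on the ocfs2 mount will do since the handler locates the global bitmap through the superblock.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Mirrors the definition added to ocfs2_fs.h by this patch. */
#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)

static int extend_last_group(const char *path_on_ocfs2, int new_clusters)
{
	int fd, ret;

	fd = open(path_on_ocfs2, O_RDONLY);
	if (fd < 0)
		return -1;

	/* The kernel side does get_user() on an int, so pass a pointer to it. */
	ret = ioctl(fd, OCFS2_IOC_GROUP_EXTEND, &new_clusters);
	if (ret < 0)
		perror("OCFS2_IOC_GROUP_EXTEND");

	close(fd);
	return ret;
}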
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8d81f6c1b87..f31c7e8c19c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
#include "localalloc.h"
#include "slot_map.h"
#include "super.h"
-#include "vote.h"
#include "sysfile.h"
#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
journal->j_trans_id, flushed);
- ocfs2_kick_vote_thread(osb);
+ ocfs2_wake_downconvert_thread(osb);
wake_up(&journal->j_checkpointed);
finally:
mlog_exit(status);
@@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
return err;
}
-#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
journal_t *journal = osb->journal->j_journal;
+ unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+
+ if (osb->osb_commit_interval)
+ commit_interval = osb->osb_commit_interval;
spin_lock(&journal->j_state_lock);
- journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+ journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
journal->j_flags |= JFS_BARRIER;
else
@@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_super *osb;
- int meta_lock = 0;
+ int inode_lock = 0;
mlog_entry_void();
@@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
/* Skip recovery waits here - journal inode metadata never
* changes in a live cluster so it can be considered an
* exception to the rule. */
- status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+ status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not get lock on journal!\n");
goto done;
}
- meta_lock = 1;
+ inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
status = 0;
done:
if (status < 0) {
- if (meta_lock)
- ocfs2_meta_unlock(inode, 1);
+ if (inode_lock)
+ ocfs2_inode_unlock(inode, 1);
if (bh != NULL)
brelse(bh);
if (inode) {
@@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
OCFS2_I(inode)->ip_open_count--;
/* unlock our journal */
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
brelse(journal->j_bh);
journal->j_bh = NULL;
@@ -883,8 +886,8 @@ restart:
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
- * node(s) may have voted "no" on an inode delete earlier. A
- * revote is therefore required. */
+ * node(s) may have disallowed a previous inode delete. Re-processing
+ * is therefore required. */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
NULL);
@@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
- status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+ status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
- mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+ mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not lock journal!\n");
goto done;
@@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
done:
/* drop the lock on this nodes journal */
if (got_lock)
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
if (inode)
iput(inode);
@@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
SET_INODE_JOURNAL(inode);
flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
- status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
+ status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
if (status < 0) {
if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail:
if (inode)
iput(inode);
@@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
- OCFS2_FI_FLAG_ORPHAN_RECOVERY);
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
@@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
}
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
+ status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
*head = priv.head;
out_cluster:
- ocfs2_meta_unlock(orphan_dir_inode, 0);
+ ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
@@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
- /* Delete voting may have set these on the assumption
- * that the other node would wipe them successfully.
- * If they are still in the node's orphan dir, we need
- * to reset that state. */
+ /* The remote delete code may have set these on the
+ * assumption that the other node would wipe them
+ * successfully. If they are still in the node's
+ * orphan dir, we need to reset that state. */
oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
/* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e096156..220f3e818e7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
/* get one bit out of a suballocator: dinode + group descriptor +
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af3..add1ffdc5c6 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
-/*
- * Determine how large our local alloc window should be, in bits.
- *
- * These values (and the behavior in ocfs2_alloc_should_use_local) have
- * been chosen so that most allocations, including new block groups go
- * through local alloc.
- */
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
{
- BUG_ON(osb->s_clustersize_bits < 12);
+ BUG_ON(osb->s_clustersize_bits > 20);
- return 2048 >> (osb->s_clustersize_bits - 12);
+ /* Size local alloc windows by the megabyte */
+ return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
}
/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
int la_bits = ocfs2_local_alloc_window_bits(osb);
+ int ret = 0;
if (osb->local_alloc_state != OCFS2_LA_ENABLED)
- return 0;
+ goto bail;
/* la_bits should be at least twice the size (in clusters) of
* a new block group. We want to be sure block group
* allocations go through the local alloc, so allow an
* allocation to take up to half the bitmap. */
if (bits > (la_bits / 2))
- return 0;
+ goto bail;
- return 1;
+ ret = 1;
+bail:
+ mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
+ osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+ return ret;
}
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
+ if (ocfs2_mount_local(osb))
+ goto bail;
+
+ if (osb->local_alloc_size == 0)
+ goto bail;
+
+ if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+ mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+ "than max possible %u. Using defaults.\n",
+ ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
+ osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ }
+
/* read the alloc off disk */
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num);
@@ -181,6 +193,9 @@ bail:
if (inode)
iput(inode);
+ mlog(0, "Local alloc window bits = %d\n",
+ ocfs2_local_alloc_window_bits(osb));
+
mlog_exit(status);
return status;
}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
mutex_lock(&main_bm_inode->i_mutex);
- status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+ status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
if (main_bm_bh)
brelse(main_bm_bh);
- ocfs2_meta_unlock(main_bm_inode, 1);
+ ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&main_bm_inode->i_mutex);
- status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+ status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle);
out_unlock:
- ocfs2_meta_unlock(main_bm_inode, 1);
+ ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
@@ -521,6 +536,9 @@ bail:
iput(local_alloc_inode);
}
+ mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
+ status);
+
mlog_exit(status);
return status;
}
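The rewritten ocfs2_local_alloc_window_bits() above sizes the window from the per-mount local_alloc_size value (in megabytes, presumably set from a mount option handled elsewhere in this series) rather than a fixed amount, with ocfs2_load_local_alloc() falling back to OCFS2_DEFAULT_LOCAL_ALLOC_SIZE when the requested window would exceed a cluster group. A worked example of the shift, as a standalone sketch assuming the default of 8 MB:

/* Illustrative only: local alloc window size, in clusters (bits). */
static inline int la_window_bits(int local_alloc_size_mb, int clustersize_bits)
{
	return local_alloc_size_mb << (20 - clustersize_bits);
}

/*
 * la_window_bits(8, 12) == 2048  -- 2048 x 4 KB clusters = 8 MB
 * la_window_bits(8, 20) ==    8  --    8 x 1 MB clusters = 8 MB
 */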
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 00000000000..203f8714387
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "locks.h"
+
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+ int cmd, struct file_lock *fl)
+{
+ int ret = 0, level = 0, trylock = 0;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+
+ if (fl->fl_type == F_WRLCK)
+ level = 1;
+ if (!IS_SETLKW(cmd))
+ trylock = 1;
+
+ mutex_lock(&fp->fp_mutex);
+
+ if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+ lockres->l_level > LKM_NLMODE) {
+ int old_level = 0;
+
+ if (lockres->l_level == LKM_EXMODE)
+ old_level = 1;
+
+ if (level == old_level)
+ goto out;
+
+ /*
+ * Converting an existing lock is not guaranteed to be
+ * atomic, so we can get away with simply unlocking
+ * here and allowing the lock code to try at the new
+ * level.
+ */
+
+ flock_lock_file_wait(file,
+ &(struct file_lock){.fl_type = F_UNLCK});
+
+ ocfs2_file_unlock(file);
+ }
+
+ ret = ocfs2_file_lock(file, level, trylock);
+ if (ret) {
+ if (ret == -EAGAIN && trylock)
+ ret = -EWOULDBLOCK;
+ else
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = flock_lock_file_wait(file, fl);
+
+out:
+ mutex_unlock(&fp->fp_mutex);
+
+ return ret;
+}
+
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{
+ int ret;
+ struct ocfs2_file_private *fp = file->private_data;
+
+ mutex_lock(&fp->fp_mutex);
+ ocfs2_file_unlock(file);
+ ret = flock_lock_file_wait(file, fl);
+ mutex_unlock(&fp->fp_mutex);
+
+ return ret;
+}
+
+/*
+ * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ if (__mandatory_lock(inode))
+ return -ENOLCK;
+
+ if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+ ocfs2_mount_local(osb))
+ return flock_lock_file_wait(file, fl);
+
+ if (fl->fl_type == F_UNLCK)
+ return ocfs2_do_funlock(file, cmd, fl);
+ else
+ return ocfs2_do_flock(file, inode, cmd, fl);
+}
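With ocfs2_flock() wired into both file_operations tables above, flock(2) becomes cluster-aware: an exclusive lock taken on one node blocks (or, with LOCK_NB, fails) until conflicting holders on other nodes drop theirs, unless the filesystem is mounted with localflocks or as a local mount, in which case flock_lock_file_wait() keeps the traditional node-local semantics. A small user-space illustration (the path is just an example):

#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/shared.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	/* Blocks until no other process -- on any node -- holds a conflicting lock. */
	if (flock(fd, LOCK_EX) == 0) {
		/* ... exclusive, cluster-wide critical section ... */
		flock(fd, LOCK_UN);
	}

	close(fd);
	return 0;
}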
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de3..9743ef2324e 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
- * vote.h
+ * locks.h
*
- * description here
+ * Function prototypes for Userspace file locking support
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
@@ -23,26 +23,9 @@
* Boston, MA 021110-1307, USA.
*/
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
-#ifndef VOTE_H
-#define VOTE_H
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
-int ocfs2_vote_thread(void *arg);
-static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
-{
- spin_lock(&osb->vote_task_lock);
- /* make sure the voting thread gets a swipe at whatever changes
- * the caller may have made to the voting state */
- osb->vote_wake_sequence++;
- spin_unlock(&osb->vote_task_lock);
- wake_up(&osb->vote_event);
-}
-
-int ocfs2_request_mount_vote(struct ocfs2_super *osb);
-int ocfs2_request_umount_vote(struct ocfs2_super *osb);
-int ocfs2_register_net_handlers(struct ocfs2_super *osb);
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
- int node_num);
-#endif
+#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d29..3dc18d67557 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
* node. Taking the data lock will also ensure that we don't
* attempt page truncation as part of a downconvert.
*/
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_meta_unlock;
- }
-
ret = __ocfs2_page_mkwrite(inode, di_bh, page);
- ocfs2_data_unlock(inode, 1);
-
-out_meta_unlock:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
brelse(di_bh);
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
out:
ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
- ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
+ ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
file->f_vfsmnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
+ ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac271858..ae9ad958751 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "vote.h"
#include "buffer_head_io.h"
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
- status = ocfs2_meta_lock(dir, NULL, 0);
+ status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
if (status < 0)
goto bail_add;
- inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+ inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
if (IS_ERR(inode)) {
ret = ERR_PTR(-EACCES);
goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
* unlink on another node will message us to remove that
* dentry under this lock so otherwise we can race this with
- * the vote thread and have a stale dentry. */
- ocfs2_meta_unlock(dir, 0);
+ * the downconvert thread and have a stale dentry. */
+ ocfs2_inode_unlock(dir, 0);
bail:
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
- status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+ status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
- ocfs2_meta_unlock(dir, 1);
+ ocfs2_inode_unlock(dir, 1);
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+ err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
- err = ocfs2_meta_lock(inode, &fe_bh, 1);
+ err = ocfs2_inode_lock(inode, &fe_bh, 1);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock_inode:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
out:
- ocfs2_meta_unlock(dir, 1);
+ ocfs2_inode_unlock(dir, 1);
if (de_bh)
brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
return -EPERM;
}
- status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
+ status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_meta_lock(inode, &fe_bh, 1);
+ status = ocfs2_inode_lock(inode, &fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
status = ocfs2_remote_dentry_delete(dentry);
if (status < 0) {
- /* This vote should succeed under all normal
+ /* This remote delete should succeed under all normal
* circumstances. */
mlog_errno(status);
goto leave;
@@ -841,13 +840,13 @@ leave:
ocfs2_commit_trans(osb, handle);
if (child_locked)
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
- ocfs2_meta_unlock(dir, 1);
+ ocfs2_inode_unlock(dir, 1);
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
- ocfs2_meta_unlock(orphan_dir, 1);
+ ocfs2_inode_unlock(orphan_dir, 1);
mutex_unlock(&orphan_dir->i_mutex);
iput(orphan_dir);
}
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
inode1 = tmpinode;
}
/* lock id2 */
- status = ocfs2_meta_lock(inode2, bh2, 1);
+ status = ocfs2_inode_lock(inode2, bh2, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
- status = ocfs2_meta_lock(inode1, bh1, 1);
+ status = ocfs2_inode_lock(inode1, bh1, 1);
if (status < 0) {
/*
* An error return must mean that no cluster locks
* were held on function exit.
*/
if (oi1->ip_blkno != oi2->ip_blkno)
- ocfs2_meta_unlock(inode2, 1);
+ ocfs2_inode_unlock(inode2, 1);
if (status != -ENOENT)
mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
{
- ocfs2_meta_unlock(inode1, 1);
+ ocfs2_inode_unlock(inode1, 1);
if (inode1 != inode2)
- ocfs2_meta_unlock(inode2, 1);
+ ocfs2_inode_unlock(inode2, 1);
}
static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
/*
* Aside from allowing a meta data update, the locking here
- * also ensures that the vote thread on other nodes won't have
- * to concurrently downconvert the inode and the dentry locks.
+ * also ensures that the downconvert thread on other nodes
+ * won't have to concurrently downconvert the inode and the
+ * dentry locks.
*/
- status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
+ status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
+ status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
ocfs2_double_unlock(old_dir, new_dir);
if (old_child_locked)
- ocfs2_meta_unlock(old_inode, 1);
+ ocfs2_inode_unlock(old_inode, 1);
if (new_child_locked)
- ocfs2_meta_unlock(new_inode, 1);
+ ocfs2_inode_unlock(new_inode, 1);
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
- ocfs2_meta_unlock(orphan_dir, 1);
+ ocfs2_inode_unlock(orphan_dir, 1);
mutex_unlock(&orphan_dir->i_mutex);
iput(orphan_dir);
}
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
credits = ocfs2_calc_symlink_credits(sb);
/* lock the parent directory */
- status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+ status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
- ocfs2_meta_unlock(dir, 1);
+ ocfs2_inode_unlock(dir, 1);
if (new_fe_bh)
brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
orphan_dir_bh, name,
OCFS2_ORPHAN_NAMELEN, de_bh);
if (status < 0) {
- ocfs2_meta_unlock(orphan_dir_inode, 1);
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b..d0848058047 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
* about to be
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
struct ocfs2_lock_res_ops;
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;
spinlock_t node_map_lock;
- struct ocfs2_node_map mounted_map;
struct ocfs2_node_map recovery_map;
- struct ocfs2_node_map umount_map;
u64 root_blkno;
u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
wait_queue_head_t checkpoint_event;
atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
+ unsigned long osb_commit_interval;
+ int local_alloc_size;
enum ocfs2_local_alloc_state local_alloc_state;
struct buffer_head *local_alloc_bh;
u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
wait_queue_head_t recovery_event;
- spinlock_t vote_task_lock;
- struct task_struct *vote_task;
- wait_queue_head_t vote_event;
- unsigned long vote_wake_sequence;
- unsigned long vote_work_sequence;
+ spinlock_t dc_task_lock;
+ struct task_struct *dc_task;
+ wait_queue_head_t dc_event;
+ unsigned long dc_wake_sequence;
+ unsigned long dc_work_sequence;
+ /*
+ * Any thread can add locks to the list, but the downconvert
+ * thread is the only one allowed to remove locks. Any change
+ * to this rule requires updating
+ * ocfs2_downconvert_thread_do_work().
+ */
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- struct list_head vote_list;
- int vote_count;
-
- u32 net_key;
- spinlock_t net_response_lock;
- unsigned int net_response_ids;
- struct list_head net_response_list;
-
- struct o2hb_callback_func osb_hb_up;
- struct o2hb_callback_func osb_hb_down;
-
- struct list_head osb_net_handlers;
-
wait_queue_head_t osb_mount_event;
/* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a7..3633edd3982 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+ __u64 group; /* Group descriptor's blkno. */
+ __u32 clusters; /* Total number of clusters in this group */
+ __u32 frees; /* Total free clusters in this group */
+ __u16 chain; /* Chain for this group */
+ __u16 reserved1;
+ __u32 reserved2;
+};
+
+#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
+
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
*/
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
+/*
+ * Default local alloc size (in megabytes)
+ *
+ * The value chosen should be such that most allocations, including new
+ * block groups, use local alloc.
+ */
+#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
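The fields of ocfs2_new_group_input correspond one-to-one with the consistency checks ocfs2_check_new_group() and ocfs2_verify_group_and_input() perform later in this patch. A hypothetical user-space sketch of filling it in for OCFS2_IOC_GROUP_ADD follows; the struct and ioctl number are re-declared locally only for illustration, and in practice tunefs.ocfs2 derives the values from the on-disk chain allocator after writing the new group descriptor.

#include <stdint.h>
#include <sys/ioctl.h>

/* Mirrors the definitions added to ocfs2_fs.h by this patch. */
struct ocfs2_new_group_input {
	uint64_t group;      /* blkno of the pre-initialized group descriptor */
	uint32_t clusters;   /* total clusters covered by the new group */
	uint32_t frees;      /* free clusters in the new group */
	uint16_t chain;      /* chain record to link the group into */
	uint16_t reserved1;
	uint32_t reserved2;
};
#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2, struct ocfs2_new_group_input)

static int add_group(int fd, uint64_t gd_blkno, uint32_t clusters,
		     uint32_t frees, uint16_t chain)
{
	struct ocfs2_new_group_input in = {
		.group    = gd_blkno,
		.clusters = clusters,
		.frees    = frees,
		.chain    = chain,
	};

	return ioctl(fd, OCFS2_IOC_GROUP_ADD, &in);
}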
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38a..86f3e3799c2 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
+ OCFS2_LOCK_TYPE_FLOCK,
OCFS2_NUM_LOCK_TYPES
};
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_OPEN:
c = 'O';
break;
+ case OCFS2_LOCK_TYPE_FLOCK:
+ c = 'F';
+ break;
default:
c = '\0';
}
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
+ [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 00000000000..37835ffcb03
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+
+/*
+ * Check whether any new backup superblocks exist
+ * in the last group. If there are some, mark or
+ * clear them in the bitmap.
+ *
+ * Return how many backups we find in the last group.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+ struct ocfs2_group_desc *gd,
+ int new_clusters,
+ u32 first_new_cluster,
+ u16 cl_cpg,
+ int set)
+{
+ int i;
+ u16 backups = 0;
+ u32 cluster;
+ u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+ cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+ if (gd_blkno < lgd_blkno)
+ continue;
+ else if (gd_blkno > lgd_blkno)
+ break;
+
+ if (set)
+ ocfs2_set_bit(cluster % cl_cpg,
+ (unsigned long *)gd->bg_bitmap);
+ else
+ ocfs2_clear_bit(cluster % cl_cpg,
+ (unsigned long *)gd->bg_bitmap);
+ backups++;
+ }
+
+ mlog_exit_void();
+ return backups;
+}
+
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+ struct inode *bm_inode,
+ struct buffer_head *bm_bh,
+ struct buffer_head *group_bh,
+ u32 first_new_cluster,
+ int new_clusters)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+ struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+ struct ocfs2_chain_rec *cr;
+ struct ocfs2_group_desc *group;
+ u16 chain, num_bits, backups = 0;
+ u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+ u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+
+ mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
+ new_clusters, first_new_cluster);
+
+ ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+ /* update the group first. */
+ num_bits = new_clusters * cl_bpc;
+ le16_add_cpu(&group->bg_bits, num_bits);
+ le16_add_cpu(&group->bg_free_bits_count, num_bits);
+
+ /*
+ * check whether any new backup superblocks exist in
+ * this group and update the group bitmap accordingly.
+ */
+ if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+ backups = ocfs2_calc_new_backup_super(bm_inode,
+ group,
+ new_clusters,
+ first_new_cluster,
+ cl_cpg, 1);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+ }
+
+ ret = ocfs2_journal_dirty(handle, group_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_rollback;
+ }
+
+ /* update the inode accordingly. */
+ ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_rollback;
+ }
+
+ chain = le16_to_cpu(group->bg_chain);
+ cr = (&cl->cl_recs[chain]);
+ le32_add_cpu(&cr->c_total, num_bits);
+ le32_add_cpu(&cr->c_free, num_bits);
+ le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+ le32_add_cpu(&fe->i_clusters, new_clusters);
+
+ if (backups) {
+ le32_add_cpu(&cr->c_free, -1 * backups);
+ le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+ }
+
+ spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+ OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+ spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+ i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+
+ ocfs2_journal_dirty(handle, bm_bh);
+
+out_rollback:
+ if (ret < 0) {
+ ocfs2_calc_new_backup_super(bm_inode,
+ group,
+ new_clusters,
+ first_new_cluster,
+ cl_cpg, 0);
+ le16_add_cpu(&group->bg_free_bits_count, backups);
+ le16_add_cpu(&group->bg_bits, -1 * num_bits);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+ }
+out:
+ mlog_exit(ret);
+ return ret;
+}
+
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+ int i, ret = 0;
+ u32 cluster;
+ u64 blkno;
+ struct buffer_head *backup = NULL;
+ struct ocfs2_dinode *backup_di = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ /* calculate the real backups we need to update. */
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+ cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+ if (cluster > clusters)
+ break;
+
+ ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+
+ backup_di = (struct ocfs2_dinode *)backup->b_data;
+ backup_di->i_blkno = cpu_to_le64(blkno);
+
+ ret = ocfs2_write_super_or_backup(osb, backup);
+ brelse(backup);
+ backup = NULL;
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void ocfs2_update_super_and_backups(struct inode *inode,
+ int new_clusters)
+{
+ int ret;
+ u32 clusters = 0;
+ struct buffer_head *super_bh = NULL;
+ struct ocfs2_dinode *super_di = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ /*
+ * update the superblock last.
+ * It doesn't matter if the write failed.
+ */
+ ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
+ &super_bh, 0, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ super_di = (struct ocfs2_dinode *)super_bh->b_data;
+ le32_add_cpu(&super_di->i_clusters, new_clusters);
+ clusters = le32_to_cpu(super_di->i_clusters);
+
+ ret = ocfs2_write_super_or_backup(osb, super_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
+ ret = update_backups(inode, clusters, super_bh->b_data);
+
+out:
+ brelse(super_bh);
+ if (ret)
+ printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+ " during fs resize. This condition is not fatal,"
+ " but fsck.ocfs2 should be run to fix it\n",
+ osb->dev_str);
+ return;
+}
+
+/*
+ * Extend the filesystem to the new number of clusters specified. This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+ int ret;
+ handle_t *handle;
+ struct buffer_head *main_bm_bh = NULL;
+ struct buffer_head *group_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct ocfs2_dinode *fe = NULL;
+ struct ocfs2_group_desc *group = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u16 cl_bpc;
+ u32 first_new_cluster;
+ u64 lgd_blkno;
+
+ mlog_entry_void();
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ if (new_clusters < 0)
+ return -EINVAL;
+ else if (new_clusters == 0)
+ return 0;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+ ocfs2_group_bitmap_size(osb->sb) * 8) {
+ mlog(ML_ERROR, "The disk is too old and small. "
+ "Force to do offline resize.");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(fe)) {
+ OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ first_new_cluster = le32_to_cpu(fe->i_clusters);
+ lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+ first_new_cluster - 1);
+
+ ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
+ main_bm_inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+ ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+ if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
+ le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ mlog(0, "extend the last group at %llu, new clusters = %d\n",
+ (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
+
+ handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+ if (IS_ERR(handle)) {
+ mlog_errno(PTR_ERR(handle));
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* update the last group descriptor and inode. */
+ ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+ main_bm_bh, group_bh,
+ first_new_cluster,
+ new_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ brelse(group_bh);
+ brelse(main_bm_bh);
+
+ ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+
+out:
+ mlog_exit_void();
+ return ret;
+}
+
+static int ocfs2_check_new_group(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_new_group_input *input,
+ struct buffer_head *group_bh)
+{
+ int ret;
+ struct ocfs2_group_desc *gd;
+ u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+ unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
+ le16_to_cpu(di->id2.i_chain.cl_bpc);
+
+
+ gd = (struct ocfs2_group_desc *)group_bh->b_data;
+
+ ret = -EIO;
+ if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+ mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno));
+ else if (di->i_blkno != gd->bg_parent_dinode)
+ mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
+ "pointer (%llu, expected %llu)\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ else if (le16_to_cpu(gd->bg_bits) > max_bits)
+ mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits));
+ else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
+ mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+ "claims that %u are free\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ le16_to_cpu(gd->bg_free_bits_count));
+ else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
+ mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+ "max bitmap bits of %u\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ 8 * le16_to_cpu(gd->bg_size));
+ else if (le16_to_cpu(gd->bg_chain) != input->chain)
+ mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+ "while input has %u set.\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_chain), input->chain);
+ else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+ mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+ "input has %u clusters set\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits), input->clusters);
+ else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+ mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+ "but it should have %u set\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ input->frees * cl_bpc);
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ocfs2_verify_group_and_input(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_new_group_input *input,
+ struct buffer_head *group_bh)
+{
+ u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
+ u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+ u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
+ u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
+ u32 total_clusters = le32_to_cpu(di->i_clusters);
+ int ret = -EINVAL;
+
+	if (cluster < total_clusters)
+		mlog(ML_ERROR, "new group is inside the current volume.\n");
+	else if (input->chain >= cl_count)
+		mlog(ML_ERROR, "input chain exceeds the limit.\n");
+	else if (next_free != cl_count && next_free != input->chain)
+		mlog(ML_ERROR,
+		     "the added group should be in chain %u\n", next_free);
+	else if (total_clusters + input->clusters < total_clusters)
+		mlog(ML_ERROR, "the added group's clusters overflow.\n");
+	else if (input->clusters > cl_cpg)
+		mlog(ML_ERROR, "the cluster count exceeds the maximum clusters per group\n");
+	else if (input->frees > input->clusters)
+		mlog(ML_ERROR, "the free cluster count exceeds the total cluster count\n");
+ else if (total_clusters % cl_cpg != 0)
+ mlog(ML_ERROR,
+ "the last group isn't full. Use group extend first.\n");
+ else if (input->group != ocfs2_which_cluster_group(inode, cluster))
+ mlog(ML_ERROR, "group blkno is invalid\n");
+ else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
+ mlog(ML_ERROR, "group descriptor check failed.\n");
+ else
+ ret = 0;
+
+ return ret;
+}
+
+/* Add a new group descriptor to global_bitmap. */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+ int ret;
+ handle_t *handle;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct ocfs2_dinode *fe = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *group_bh = NULL;
+ struct ocfs2_group_desc *group = NULL;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_chain_rec *cr;
+ u16 cl_bpc;
+
+ mlog_entry_void();
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+ ocfs2_group_bitmap_size(osb->sb) * 8) {
+		mlog(ML_ERROR, "The disk is too old and small for online "
+		     "resize; please do an offline resize instead.");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+ if (ret < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+ "from the device.", (unsigned long long)input->group);
+ goto out_unlock;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, group_bh);
+
+ ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
+ (unsigned long long)input->group, input->chain, input->clusters);
+
+ handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+ if (IS_ERR(handle)) {
+ mlog_errno(PTR_ERR(handle));
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+ cl = &fe->id2.i_chain;
+ cr = &cl->cl_recs[input->chain];
+
+ ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+ group->bg_next_group = cr->c_blkno;
+
+ ret = ocfs2_journal_dirty(handle, group_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+ le16_add_cpu(&cl->cl_next_free_rec, 1);
+ memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+ }
+
+	cr->c_blkno = cpu_to_le64(input->group);
+ le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+ le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters * cl_bpc);
+ le32_add_cpu(&fe->id1.bitmap1.i_used,
+ (input->clusters - input->frees) * cl_bpc);
+ le32_add_cpu(&fe->i_clusters, input->clusters);
+
+ ocfs2_journal_dirty(handle, main_bm_bh);
+
+ spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+ OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+ spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+ i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+
+ ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ brelse(group_bh);
+ brelse(main_bm_bh);
+
+ ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+
+out:
+ mlog_exit_void();
+ return ret;
+}
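
The resize.c helpers above share one idiom: ret starts out holding an error code, an if/else-if cascade logs the first failing sanity check, and ret is cleared only when every check passes. A minimal stand-alone sketch of that validate-first pattern is below; the struct, field names, and limits are made up for illustration and are not part of ocfs2.

#include <stdio.h>

/* hypothetical descriptor, loosely modelled on the checks above */
struct new_group {
	unsigned int chain;	/* chain the group claims to join   */
	unsigned int clusters;	/* clusters the group claims to add */
	unsigned int frees;	/* clusters it claims are free      */
};

/* return 0 if the group is acceptable, -1 after logging the first problem */
static int check_new_group(const struct new_group *g,
			   unsigned int max_chains, unsigned int max_clusters)
{
	int ret = -1;

	if (g->chain >= max_chains)
		fprintf(stderr, "chain %u exceeds limit %u\n",
			g->chain, max_chains);
	else if (g->clusters > max_clusters)
		fprintf(stderr, "cluster count %u exceeds the per-group max %u\n",
			g->clusters, max_clusters);
	else if (g->frees > g->clusters)
		fprintf(stderr, "free count %u exceeds cluster count %u\n",
			g->frees, g->clusters);
	else
		ret = 0;	/* every check passed */

	return ret;
}

int main(void)
{
	struct new_group g = { .chain = 1, .clusters = 8, .frees = 9 };

	return check_new_group(&g, 4, 32) ? 1 : 0;
}
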
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 00000000000..f38841abf10
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
+
+int ocfs2_group_extend(struct inode *inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
+
+#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cf..3a50ce555e6 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
s16 slot_num,
s16 node_num);
-/* Use the slot information we've collected to create a map of mounted
- * nodes. Should be holding an EX on super block. assumes slot info is
- * up to date. Note that we call this *after* we find a slot, so our
- * own node should be set in the map too... */
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
-{
- int i;
- struct ocfs2_slot_info *si = osb->slot_info;
-
- spin_lock(&si->si_lock);
-
- for (i = 0; i < si->si_size; i++)
- if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
- ocfs2_node_map_set_bit(osb, &osb->mounted_map,
- si->si_global_node_nums[i]);
-
- spin_unlock(&si->si_lock);
-}
-
/* post the slot information on disk into our slot_info struct. */
void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031..1025872aaad 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
s16 slot_num);
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
-
static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
int slot_num)
{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3..7e397e2c25d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
u64 bg_blkno,
u16 bg_bit_off);
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
- u32 cluster);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
if (inode) {
if (ac->ac_which != OCFS2_AC_USE_LOCAL)
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
mutex_unlock(&inode->i_mutex);
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
/* somewhat more expensive than our other checks, so use sparingly. */
-static int ocfs2_check_group_descriptor(struct super_block *sb,
- struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd)
+int ocfs2_check_group_descriptor(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct ocfs2_group_desc *gd)
{
unsigned int max_bits;
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
mutex_lock(&alloc_inode->i_mutex);
- status = ocfs2_meta_lock(alloc_inode, &bh, 1);
+ status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
mutex_unlock(&alloc_inode->i_mutex);
iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
- u32 cluster)
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
if (min_clusters > (osb->bitmap_cpg - 1)) {
/* The only paths asking for contiguousness
* should know about this already. */
- mlog(ML_ERROR, "minimum allocation requested exceeds "
- "group bitmap size!");
+ mlog(ML_ERROR, "minimum allocation requested %u exceeds "
+ "group bitmap size %u!\n", min_clusters,
+ osb->bitmap_cpg);
status = -ENOSPC;
goto bail;
}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe9370309..8799033bb45 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+
+/* somewhat more expensive than our other checks, so use sparingly. */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct ocfs2_group_desc *gd);
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee77542066..01fe40ee5ea 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
-#include "vote.h"
#include "buffer_head_io.h"
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
struct mount_options
{
+ unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
+ unsigned int localalloc_opt;
};
static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
Opt_data_writeback,
Opt_atime_quantum,
Opt_slot,
+ Opt_commit,
+ Opt_localalloc,
+ Opt_localflocks,
Opt_err,
};
@@ -165,6 +169,9 @@ static match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
{Opt_slot, "preferred_slot=%u"},
+ {Opt_commit, "commit=%u"},
+ {Opt_localalloc, "localalloc=%d"},
+ {Opt_localflocks, "localflocks"},
{Opt_err, NULL}
};
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
mlog_entry_void();
- new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
osb->root_inode = new;
- new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@@ -443,6 +450,8 @@ unlock_osb:
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
+ if (parsed_options.commit_interval)
+ osb->osb_commit_interval = parsed_options.commit_interval;
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
+ osb->osb_commit_interval = parsed_options.commit_interval;
+ osb->local_alloc_size = parsed_options.localalloc_opt;
sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
options ? options : "(none)");
+ mopt->commit_interval = 0;
mopt->mount_opt = 0;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
+ mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
if (!options) {
status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
if (option)
mopt->slot = (s16)option;
break;
+ case Opt_commit:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option < 0)
+ return 0;
+ if (option == 0)
+ option = JBD_DEFAULT_MAX_COMMIT_AGE;
+ mopt->commit_interval = HZ * option;
+ break;
+ case Opt_localalloc:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ mopt->localalloc_opt = option;
+ break;
+ case Opt_localflocks:
+ /*
+ * Changing this during remount could race
+ * flock() requests, or "unbalance" existing
+ * ones (e.g., a lock is taken in one mode but
+ * dropped in the other). If users care enough
+ * to flip locking modes during remount, we
+ * could add a "local" flag to individual
+ * flock structures for proper tracking of
+ * state.
+ */
+ if (!is_remount)
+ mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+ if (osb->osb_commit_interval)
+ seq_printf(s, ",commit=%u",
+ (unsigned) (osb->osb_commit_interval / HZ));
+
+ if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+
+ if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+		seq_printf(s, ",localflocks");
+
return 0;
}
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
goto bail;
}
- status = ocfs2_meta_lock(inode, &bh, 0);
+ status = ocfs2_inode_lock(inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
brelse(bh);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
oi->ip_clusters = 0;
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
- ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
- ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
goto leave;
}
- status = ocfs2_register_hb_callbacks(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- /* requires vote_thread to be running. */
- status = ocfs2_register_net_handlers(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
goto leave;
}
- ocfs2_populate_mounted_map(osb);
-
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_mount_local(osb))
goto leave;
- /* This should be sent *after* we recovered our journal as it
- * will cause other nodes to unmark us as needing
- * recovery. However, we need to send it *before* dropping the
- * super block lock as otherwise their recovery threads might
- * try to clean us up while we're live! */
- status = ocfs2_request_mount_vote(osb);
- if (status < 0)
- mlog_errno(status);
-
leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
mlog_errno(tmp);
return;
}
-
- tmp = ocfs2_request_umount_vote(osb);
- if (tmp < 0)
- mlog_errno(tmp);
}
if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_release_system_inodes(osb);
- if (osb->dlm) {
- ocfs2_unregister_net_handlers(osb);
-
+ if (osb->dlm)
ocfs2_dlm_shutdown(osb);
- }
-
- ocfs2_clear_hb_callbacks(osb);
debugfs_remove(osb->osb_debug_root);
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct buffer_head *bitmap_bh = NULL;
struct ocfs2_journal *journal;
__le32 uuid_net_key;
struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
- osb->net_response_ids = 0;
- spin_lock_init(&osb->net_response_lock);
- INIT_LIST_HEAD(&osb->net_response_list);
-
- INIT_LIST_HEAD(&osb->osb_net_handlers);
init_waitqueue_head(&osb->recovery_event);
- spin_lock_init(&osb->vote_task_lock);
- init_waitqueue_head(&osb->vote_event);
- osb->vote_work_sequence = 0;
- osb->vote_wake_sequence = 0;
+ spin_lock_init(&osb->dc_task_lock);
+ init_waitqueue_head(&osb->dc_event);
+ osb->dc_work_sequence = 0;
+ osb->dc_wake_sequence = 0;
INIT_LIST_HEAD(&osb->blocked_lock_list);
osb->blocked_lock_count = 0;
- INIT_LIST_HEAD(&osb->vote_list);
spin_lock_init(&osb->osb_lock);
atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
- osb->net_key = le32_to_cpu(uuid_net_key);
strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-
- /* We don't have a cluster lock on the bitmap here because
- * we're only interested in static information and the extra
- * complexity at mount time isn't worht it. Don't pass the
- * inode in to the read function though as we don't want it to
- * be put in the cache. */
- status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
- NULL);
iput(inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- di = (struct ocfs2_dinode *) bitmap_bh->b_data;
- osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
- brelse(bitmap_bh);
- mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
- (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6..ab713ebdd54 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
goto bail;
}
- inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
+ inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c9..e2488f4128a 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
#include "ver.h"
-#define OCFS2_BUILD_VERSION "1.3.3"
+#define OCFS2_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2..00000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.c
- *
- * description here
- *
- * Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
-
-#define MLOG_MASK_PREFIX ML_VOTE
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "journal.h"
-#include "slot_map.h"
-#include "vote.h"
-
-#include "buffer_head_io.h"
-
-#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
-#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
-struct ocfs2_msg_hdr
-{
- __be32 h_response_id; /* used to lookup message handle on sending
- * node. */
- __be32 h_request;
- __be64 h_blkno;
- __be32 h_generation;
- __be32 h_node_num; /* node sending this particular message. */
-};
-
-struct ocfs2_vote_msg
-{
- struct ocfs2_msg_hdr v_hdr;
- __be32 v_reserved1;
-} __attribute__ ((packed));
-
-/* Responses are given these values to maintain backwards
- * compatibility with older ocfs2 versions */
-#define OCFS2_RESPONSE_OK (0)
-#define OCFS2_RESPONSE_BUSY (-16)
-#define OCFS2_RESPONSE_BAD_MSG (-22)
-
-struct ocfs2_response_msg
-{
- struct ocfs2_msg_hdr r_hdr;
- __be32 r_response;
-} __attribute__ ((packed));
-
-struct ocfs2_vote_work {
- struct list_head w_list;
- struct ocfs2_vote_msg w_msg;
-};
-
-enum ocfs2_vote_request {
- OCFS2_VOTE_REQ_INVALID = 0,
- OCFS2_VOTE_REQ_MOUNT,
- OCFS2_VOTE_REQ_UMOUNT,
- OCFS2_VOTE_REQ_LAST
-};
-
-static inline int ocfs2_is_valid_vote_request(int request)
-{
- return OCFS2_VOTE_REQ_INVALID < request &&
- request < OCFS2_VOTE_REQ_LAST;
-}
-
-typedef void (*ocfs2_net_response_callback)(void *priv,
- struct ocfs2_response_msg *resp);
-struct ocfs2_net_response_cb {
- ocfs2_net_response_callback rc_cb;
- void *rc_priv;
-};
-
-struct ocfs2_net_wait_ctxt {
- struct list_head n_list;
- u32 n_response_id;
- wait_queue_head_t n_event;
- struct ocfs2_node_map n_node_map;
- int n_response; /* an agreggate response. 0 if
- * all nodes are go, < 0 on any
- * negative response from any
- * node or network error. */
- struct ocfs2_net_response_cb *n_callback;
-};
-
-static void ocfs2_process_mount_request(struct ocfs2_super *osb,
- unsigned int node_num)
-{
- mlog(0, "MOUNT vote from node %u\n", node_num);
- /* The other node only sends us this message when he has an EX
- * on the superblock, so our recovery threads (if having been
- * launched) are waiting on it.*/
- ocfs2_recovery_map_clear(osb, node_num);
- ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
-
- /* We clear the umount map here because a node may have been
- * previously mounted, safely unmounted but never stopped
- * heartbeating - in which case we'd have a stale entry. */
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
-static void ocfs2_process_umount_request(struct ocfs2_super *osb,
- unsigned int node_num)
-{
- mlog(0, "UMOUNT vote from node %u\n", node_num);
- ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
- ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
-}
-
-static void ocfs2_process_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *msg)
-{
- int net_status, vote_response;
- unsigned int node_num;
- u64 blkno;
- enum ocfs2_vote_request request;
- struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
- struct ocfs2_response_msg response;
-
- /* decode the network mumbo jumbo into local variables. */
- request = be32_to_cpu(hdr->h_request);
- blkno = be64_to_cpu(hdr->h_blkno);
- node_num = be32_to_cpu(hdr->h_node_num);
-
- mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
- request, (unsigned long long)blkno, node_num);
-
- if (!ocfs2_is_valid_vote_request(request)) {
- mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
- request, node_num);
- vote_response = OCFS2_RESPONSE_BAD_MSG;
- goto respond;
- }
-
- vote_response = OCFS2_RESPONSE_OK;
-
- switch (request) {
- case OCFS2_VOTE_REQ_UMOUNT:
- ocfs2_process_umount_request(osb, node_num);
- goto respond;
- case OCFS2_VOTE_REQ_MOUNT:
- ocfs2_process_mount_request(osb, node_num);
- goto respond;
- default:
- /* avoids a gcc warning */
- break;
- }
-
-respond:
- /* Response struture is small so we just put it on the stack
- * and stuff it inline. */
- memset(&response, 0, sizeof(struct ocfs2_response_msg));
- response.r_hdr.h_response_id = hdr->h_response_id;
- response.r_hdr.h_blkno = hdr->h_blkno;
- response.r_hdr.h_generation = hdr->h_generation;
- response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
- response.r_response = cpu_to_be32(vote_response);
-
- net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
- osb->net_key,
- &response,
- sizeof(struct ocfs2_response_msg),
- node_num,
- NULL);
- /* We still want to error print for ENOPROTOOPT here. The
- * sending node shouldn't have unregistered his net handler
- * without sending an unmount vote 1st */
- if (net_status < 0
- && net_status != -ETIMEDOUT
- && net_status != -ENOTCONN)
- mlog(ML_ERROR, "message to node %u fails with error %d!\n",
- node_num, net_status);
-}
-
-static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
-{
- unsigned long processed;
- struct ocfs2_lock_res *lockres;
- struct ocfs2_vote_work *work;
-
- mlog_entry_void();
-
- spin_lock(&osb->vote_task_lock);
- /* grab this early so we know to try again if a state change and
- * wake happens part-way through our work */
- osb->vote_work_sequence = osb->vote_wake_sequence;
-
- processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
- lockres = list_entry(osb->blocked_lock_list.next,
- struct ocfs2_lock_res, l_blocked_list);
- list_del_init(&lockres->l_blocked_list);
- osb->blocked_lock_count--;
- spin_unlock(&osb->vote_task_lock);
-
- BUG_ON(!processed);
- processed--;
-
- ocfs2_process_blocked_lock(osb, lockres);
-
- spin_lock(&osb->vote_task_lock);
- }
-
- while (osb->vote_count) {
- BUG_ON(list_empty(&osb->vote_list));
- work = list_entry(osb->vote_list.next,
- struct ocfs2_vote_work, w_list);
- list_del(&work->w_list);
- osb->vote_count--;
- spin_unlock(&osb->vote_task_lock);
-
- ocfs2_process_vote(osb, &work->w_msg);
- kfree(work);
-
- spin_lock(&osb->vote_task_lock);
- }
- spin_unlock(&osb->vote_task_lock);
-
- mlog_exit_void();
-}
-
-static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
-{
- int empty = 0;
-
- spin_lock(&osb->vote_task_lock);
- if (list_empty(&osb->blocked_lock_list) &&
- list_empty(&osb->vote_list))
- empty = 1;
-
- spin_unlock(&osb->vote_task_lock);
- return empty;
-}
-
-static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
-{
- int should_wake = 0;
-
- spin_lock(&osb->vote_task_lock);
- if (osb->vote_work_sequence != osb->vote_wake_sequence)
- should_wake = 1;
- spin_unlock(&osb->vote_task_lock);
-
- return should_wake;
-}
-
-int ocfs2_vote_thread(void *arg)
-{
- int status = 0;
- struct ocfs2_super *osb = arg;
-
- /* only quit once we've been asked to stop and there is no more
- * work available */
- while (!(kthread_should_stop() &&
- ocfs2_vote_thread_lists_empty(osb))) {
-
- wait_event_interruptible(osb->vote_event,
- ocfs2_vote_thread_should_wake(osb) ||
- kthread_should_stop());
-
- mlog(0, "vote_thread: awoken\n");
-
- ocfs2_vote_thread_do_work(osb);
- }
-
- osb->vote_task = NULL;
- return status;
-}
-
-static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
-{
- struct ocfs2_net_wait_ctxt *w;
-
- w = kzalloc(sizeof(*w), GFP_NOFS);
- if (!w) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
-
- INIT_LIST_HEAD(&w->n_list);
- init_waitqueue_head(&w->n_event);
- ocfs2_node_map_init(&w->n_node_map);
- w->n_response_id = response_id;
- w->n_callback = NULL;
-bail:
- return w;
-}
-
-static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
-{
- unsigned int ret;
-
- spin_lock(&osb->net_response_lock);
- ret = ++osb->net_response_ids;
- spin_unlock(&osb->net_response_lock);
-
- return ret;
-}
-
-static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w)
-{
- spin_lock(&osb->net_response_lock);
- list_del(&w->n_list);
- spin_unlock(&osb->net_response_lock);
-}
-
-static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w)
-{
- spin_lock(&osb->net_response_lock);
- list_add_tail(&w->n_list,
- &osb->net_response_list);
- spin_unlock(&osb->net_response_lock);
-}
-
-static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w,
- int node_num)
-{
- assert_spin_locked(&osb->net_response_lock);
-
- ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
- if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
- wake_up(&w->n_event);
-}
-
-/* Intended to be called from the node down callback, we fake remove
- * the node from all our response contexts */
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
- int node_num)
-{
- struct list_head *p;
- struct ocfs2_net_wait_ctxt *w = NULL;
-
- spin_lock(&osb->net_response_lock);
-
- list_for_each(p, &osb->net_response_list) {
- w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-
- __ocfs2_mark_node_responded(osb, w, node_num);
- }
-
- spin_unlock(&osb->net_response_lock);
-}
-
-static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *request,
- unsigned int response_id,
- int *response,
- struct ocfs2_net_response_cb *callback)
-{
- int status, i, remote_err;
- struct ocfs2_net_wait_ctxt *w = NULL;
- int dequeued = 0;
-
- mlog_entry_void();
-
- w = ocfs2_new_net_wait_ctxt(response_id);
- if (!w) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
- w->n_callback = callback;
-
- /* we're pretty much ready to go at this point, and this fills
- * in n_response which we need anyway... */
- ocfs2_queue_net_wait_ctxt(osb, w);
-
- i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
-
- while (i != O2NM_INVALID_NODE_NUM) {
- if (i != osb->node_num) {
- mlog(0, "trying to send request to node %i\n", i);
- ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
-
- remote_err = 0;
- status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
- osb->net_key,
- request,
- sizeof(*request),
- i,
- &remote_err);
- if (status == -ETIMEDOUT) {
- mlog(0, "remote node %d timed out!\n", i);
- status = -EAGAIN;
- goto bail;
- }
- if (remote_err < 0) {
- status = remote_err;
- mlog(0, "remote error %d on node %d!\n",
- remote_err, i);
- mlog_errno(status);
- goto bail;
- }
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
- i++;
- i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
- mlog(0, "next is %d, i am %d\n", i, osb->node_num);
- }
- mlog(0, "done sending, now waiting on responses...\n");
-
- wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
-
- ocfs2_dequeue_net_wait_ctxt(osb, w);
- dequeued = 1;
-
- *response = w->n_response;
- status = 0;
-bail:
- if (w) {
- if (!dequeued)
- ocfs2_dequeue_net_wait_ctxt(osb, w);
- kfree(w);
- }
-
- mlog_exit(status);
- return status;
-}
-
-static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
- u64 blkno,
- unsigned int generation,
- enum ocfs2_vote_request type)
-{
- struct ocfs2_vote_msg *request;
- struct ocfs2_msg_hdr *hdr;
-
- BUG_ON(!ocfs2_is_valid_vote_request(type));
-
- request = kzalloc(sizeof(*request), GFP_NOFS);
- if (!request) {
- mlog_errno(-ENOMEM);
- } else {
- hdr = &request->v_hdr;
- hdr->h_node_num = cpu_to_be32(osb->node_num);
- hdr->h_request = cpu_to_be32(type);
- hdr->h_blkno = cpu_to_be64(blkno);
- hdr->h_generation = cpu_to_be32(generation);
- }
-
- return request;
-}
-
-/* Complete the buildup of a new vote request and process the
- * broadcast return value. */
-static int ocfs2_do_request_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *request,
- struct ocfs2_net_response_cb *callback)
-{
- int status, response = -EBUSY;
- unsigned int response_id;
- struct ocfs2_msg_hdr *hdr;
-
- response_id = ocfs2_new_response_id(osb);
-
- hdr = &request->v_hdr;
- hdr->h_response_id = cpu_to_be32(response_id);
-
- status = ocfs2_broadcast_vote(osb, request, response_id, &response,
- callback);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = response;
-bail:
-
- return status;
-}
-
-int ocfs2_request_mount_vote(struct ocfs2_super *osb)
-{
- int status;
- struct ocfs2_vote_msg *request = NULL;
-
- request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
- if (!request) {
- status = -ENOMEM;
- goto bail;
- }
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
- signal_pending(current)) {
- status = -ERESTARTSYS;
- goto bail;
- }
-
- if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num)) {
- status = 0;
- goto bail;
- }
-
- status = ocfs2_do_request_vote(osb, request, NULL);
- }
-
-bail:
- kfree(request);
- return status;
-}
-
-int ocfs2_request_umount_vote(struct ocfs2_super *osb)
-{
- int status;
- struct ocfs2_vote_msg *request = NULL;
-
- request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
- if (!request) {
- status = -ENOMEM;
- goto bail;
- }
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- /* Do not check signals on this vote... We really want
- * this one to go all the way through. */
-
- if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num)) {
- status = 0;
- goto bail;
- }
-
- status = ocfs2_do_request_vote(osb, request, NULL);
- }
-
-bail:
- kfree(request);
- return status;
-}
-
-/* TODO: This should eventually be a hash table! */
-static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
- u32 response_id)
-{
- struct list_head *p;
- struct ocfs2_net_wait_ctxt *w = NULL;
-
- list_for_each(p, &osb->net_response_list) {
- w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
- if (response_id == w->n_response_id)
- break;
- w = NULL;
- }
-
- return w;
-}
-
-/* Translate response codes into local node errno values */
-static inline int ocfs2_translate_response(int response)
-{
- int ret;
-
- switch (response) {
- case OCFS2_RESPONSE_OK:
- ret = 0;
- break;
-
- case OCFS2_RESPONSE_BUSY:
- ret = -EBUSY;
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-static int ocfs2_handle_response_message(struct o2net_msg *msg,
- u32 len,
- void *data, void **ret_data)
-{
- unsigned int response_id, node_num;
- int response_status;
- struct ocfs2_super *osb = data;
- struct ocfs2_response_msg *resp;
- struct ocfs2_net_wait_ctxt * w;
- struct ocfs2_net_response_cb *resp_cb;
-
- resp = (struct ocfs2_response_msg *) msg->buf;
-
- response_id = be32_to_cpu(resp->r_hdr.h_response_id);
- node_num = be32_to_cpu(resp->r_hdr.h_node_num);
- response_status =
- ocfs2_translate_response(be32_to_cpu(resp->r_response));
-
- mlog(0, "received response message:\n");
- mlog(0, "h_response_id = %u\n", response_id);
- mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
- mlog(0, "h_blkno = %llu\n",
- (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
- mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
- mlog(0, "h_node_num = %u\n", node_num);
- mlog(0, "r_response = %d\n", response_status);
-
- spin_lock(&osb->net_response_lock);
- w = __ocfs2_find_net_wait_ctxt(osb, response_id);
- if (!w) {
- mlog(0, "request not found!\n");
- goto bail;
- }
- resp_cb = w->n_callback;
-
- if (response_status && (!w->n_response)) {
- /* we only really need one negative response so don't
- * set it twice. */
- w->n_response = response_status;
- }
-
- if (resp_cb) {
- spin_unlock(&osb->net_response_lock);
-
- resp_cb->rc_cb(resp_cb->rc_priv, resp);
-
- spin_lock(&osb->net_response_lock);
- }
-
- __ocfs2_mark_node_responded(osb, w, node_num);
-bail:
- spin_unlock(&osb->net_response_lock);
-
- return 0;
-}
-
-static int ocfs2_handle_vote_message(struct o2net_msg *msg,
- u32 len,
- void *data, void **ret_data)
-{
- int status;
- struct ocfs2_super *osb = data;
- struct ocfs2_vote_work *work;
-
- work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
- if (!work) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- INIT_LIST_HEAD(&work->w_list);
- memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
-
- mlog(0, "scheduling vote request:\n");
- mlog(0, "h_response_id = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_response_id));
- mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
- mlog(0, "h_blkno = %llu\n",
- (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
- mlog(0, "h_generation = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_generation));
- mlog(0, "h_node_num = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-
- spin_lock(&osb->vote_task_lock);
- list_add_tail(&work->w_list, &osb->vote_list);
- osb->vote_count++;
- spin_unlock(&osb->vote_task_lock);
-
- ocfs2_kick_vote_thread(osb);
-
- status = 0;
-bail:
- return status;
-}
-
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
-{
- if (!osb->net_key)
- return;
-
- o2net_unregister_handler_list(&osb->osb_net_handlers);
-
- if (!list_empty(&osb->net_response_list))
- mlog(ML_ERROR, "net response list not empty!\n");
-
- osb->net_key = 0;
-}
-
-int ocfs2_register_net_handlers(struct ocfs2_super *osb)
-{
- int status = 0;
-
- if (ocfs2_mount_local(osb))
- return 0;
-
- status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
- osb->net_key,
- sizeof(struct ocfs2_response_msg),
- ocfs2_handle_response_message,
- osb, NULL, &osb->osb_net_handlers);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
- osb->net_key,
- sizeof(struct ocfs2_vote_msg),
- ocfs2_handle_vote_message,
- osb, NULL, &osb->osb_net_handlers);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-bail:
- if (status < 0)
- ocfs2_unregister_net_handlers(osb);
-
- return status;
-}
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d8817384008..6b7ff161894 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v)
/* Nothing to do */
}
-static struct seq_operations property_op = {
+static const struct seq_operations property_op = {
.start = property_start,
.next = property_next,
.stop = property_stop,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 722e12e5acc..739da701ae7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
return ERR_PTR(res);
}
-/*
- * sysfs bindings for partitions
- */
-
-struct part_attribute {
- struct attribute attr;
- ssize_t (*show)(struct hd_struct *,char *);
- ssize_t (*store)(struct hd_struct *,const char *, size_t);
-};
-
-static ssize_t
-part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
+static ssize_t part_start_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
- struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
- ssize_t ret = 0;
- if (part_attr->show)
- ret = part_attr->show(p, page);
- return ret;
-}
-static ssize_t
-part_attr_store(struct kobject * kobj, struct attribute * attr,
- const char *page, size_t count)
-{
- struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
- struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
- ssize_t ret = 0;
+ struct hd_struct *p = dev_to_part(dev);
- if (part_attr->store)
- ret = part_attr->store(p, page, count);
- return ret;
+ return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
}
-static struct sysfs_ops part_sysfs_ops = {
- .show = part_attr_show,
- .store = part_attr_store,
-};
-
-static ssize_t part_uevent_store(struct hd_struct * p,
- const char *page, size_t count)
+static ssize_t part_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- kobject_uevent(&p->kobj, KOBJ_ADD);
- return count;
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
}
-static ssize_t part_dev_read(struct hd_struct * p, char *page)
-{
- struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
- dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno);
- return print_dev_t(page, dev);
-}
-static ssize_t part_start_read(struct hd_struct * p, char *page)
-{
- return sprintf(page, "%llu\n",(unsigned long long)p->start_sect);
-}
-static ssize_t part_size_read(struct hd_struct * p, char *page)
-{
- return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects);
-}
-static ssize_t part_stat_read(struct hd_struct * p, char *page)
+
+static ssize_t part_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return sprintf(page, "%8u %8llu %8u %8llu\n",
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%8u %8llu %8u %8llu\n",
p->ios[0], (unsigned long long)p->sectors[0],
p->ios[1], (unsigned long long)p->sectors[1]);
}
-static struct part_attribute part_attr_uevent = {
- .attr = {.name = "uevent", .mode = S_IWUSR },
- .store = part_uevent_store
-};
-static struct part_attribute part_attr_dev = {
- .attr = {.name = "dev", .mode = S_IRUGO },
- .show = part_dev_read
-};
-static struct part_attribute part_attr_start = {
- .attr = {.name = "start", .mode = S_IRUGO },
- .show = part_start_read
-};
-static struct part_attribute part_attr_size = {
- .attr = {.name = "size", .mode = S_IRUGO },
- .show = part_size_read
-};
-static struct part_attribute part_attr_stat = {
- .attr = {.name = "stat", .mode = S_IRUGO },
- .show = part_stat_read
-};
#ifdef CONFIG_FAIL_MAKE_REQUEST
+static ssize_t part_fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
-static ssize_t part_fail_store(struct hd_struct * p,
+ return sprintf(buf, "%d\n", p->make_it_fail);
+}
+
+static ssize_t part_fail_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
+ struct hd_struct *p = dev_to_part(dev);
int i;
if (count > 0 && sscanf(buf, "%d", &i) > 0)
@@ -292,50 +241,53 @@ static ssize_t part_fail_store(struct hd_struct * p,
return count;
}
-static ssize_t part_fail_read(struct hd_struct * p, char *page)
-{
- return sprintf(page, "%d\n", p->make_it_fail);
-}
-static struct part_attribute part_attr_fail = {
- .attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
- .store = part_fail_store,
- .show = part_fail_read
-};
+#endif
+static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+static struct device_attribute dev_attr_fail =
+ __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
#endif
-static struct attribute * default_attrs[] = {
- &part_attr_uevent.attr,
- &part_attr_dev.attr,
- &part_attr_start.attr,
- &part_attr_size.attr,
- &part_attr_stat.attr,
+static struct attribute *part_attrs[] = {
+ &dev_attr_start.attr,
+ &dev_attr_size.attr,
+ &dev_attr_stat.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
- &part_attr_fail.attr,
+ &dev_attr_fail.attr,
#endif
- NULL,
+ NULL
};
-extern struct kset block_subsys;
+static struct attribute_group part_attr_group = {
+ .attrs = part_attrs,
+};
-static void part_release(struct kobject *kobj)
+static struct attribute_group *part_attr_groups[] = {
+ &part_attr_group,
+ NULL
+};
+
+static void part_release(struct device *dev)
{
- struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+ struct hd_struct *p = dev_to_part(dev);
kfree(p);
}
-struct kobj_type ktype_part = {
+struct device_type part_type = {
+ .name = "partition",
+ .groups = part_attr_groups,
.release = part_release,
- .default_attrs = default_attrs,
- .sysfs_ops = &part_sysfs_ops,
};
static inline void partition_sysfs_add_subdir(struct hd_struct *p)
{
struct kobject *k;
- k = kobject_get(&p->kobj);
- p->holder_dir = kobject_add_dir(k, "holders");
+ k = kobject_get(&p->dev.kobj);
+ p->holder_dir = kobject_create_and_add("holders", k);
kobject_put(k);
}
@@ -343,15 +295,16 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
{
struct kobject *k;
- k = kobject_get(&disk->kobj);
- disk->holder_dir = kobject_add_dir(k, "holders");
- disk->slave_dir = kobject_add_dir(k, "slaves");
+ k = kobject_get(&disk->dev.kobj);
+ disk->holder_dir = kobject_create_and_add("holders", k);
+ disk->slave_dir = kobject_create_and_add("slaves", k);
kobject_put(k);
}
void delete_partition(struct gendisk *disk, int part)
{
struct hd_struct *p = disk->part[part-1];
+
if (!p)
return;
if (!p->nr_sects)
@@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part)
p->nr_sects = 0;
p->ios[0] = p->ios[1] = 0;
p->sectors[0] = p->sectors[1] = 0;
- sysfs_remove_link(&p->kobj, "subsystem");
- kobject_unregister(p->holder_dir);
- kobject_uevent(&p->kobj, KOBJ_REMOVE);
- kobject_del(&p->kobj);
- kobject_put(&p->kobj);
+ kobject_put(p->holder_dir);
+ device_del(&p->dev);
+ put_device(&p->dev);
}
void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
{
struct hd_struct *p;
+ int err;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return;
-
+
p->start_sect = start;
p->nr_sects = len;
p->partno = part;
p->policy = disk->policy;
- if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
- kobject_set_name(&p->kobj, "%sp%d",
- kobject_name(&disk->kobj), part);
+ if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
+ snprintf(p->dev.bus_id, BUS_ID_SIZE,
+ "%sp%d", disk->dev.bus_id, part);
else
- kobject_set_name(&p->kobj, "%s%d",
- kobject_name(&disk->kobj),part);
- p->kobj.parent = &disk->kobj;
- p->kobj.ktype = &ktype_part;
- kobject_init(&p->kobj);
- kobject_add(&p->kobj);
- if (!disk->part_uevent_suppress)
- kobject_uevent(&p->kobj, KOBJ_ADD);
- sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
+ snprintf(p->dev.bus_id, BUS_ID_SIZE,
+ "%s%d", disk->dev.bus_id, part);
+
+ device_initialize(&p->dev);
+ p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+ p->dev.class = &block_class;
+ p->dev.type = &part_type;
+ p->dev.parent = &disk->dev;
+ disk->part[part-1] = p;
+
+ /* delay uevent until 'holders' subdir is created */
+ p->dev.uevent_suppress = 1;
+ device_add(&p->dev);
+ partition_sysfs_add_subdir(p);
+ p->dev.uevent_suppress = 0;
if (flags & ADDPART_FLAG_WHOLEDISK) {
static struct attribute addpartattr = {
.name = "whole_disk",
.mode = S_IRUSR | S_IRGRP | S_IROTH,
};
-
- sysfs_create_file(&p->kobj, &addpartattr);
+ err = sysfs_create_file(&p->dev.kobj, &addpartattr);
}
- partition_sysfs_add_subdir(p);
- disk->part[part-1] = p;
-}
-static char *make_block_name(struct gendisk *disk)
-{
- char *name;
- static char *block_str = "block:";
- int size;
- char *s;
-
- size = strlen(block_str) + strlen(disk->disk_name) + 1;
- name = kmalloc(size, GFP_KERNEL);
- if (!name)
- return NULL;
- strcpy(name, block_str);
- strcat(name, disk->disk_name);
- /* ewww... some of these buggers have / in name... */
- s = strchr(name, '/');
- if (s)
- *s = '!';
- return name;
-}
-
-static int disk_sysfs_symlinks(struct gendisk *disk)
-{
- struct device *target = get_device(disk->driverfs_dev);
- int err;
- char *disk_name = NULL;
-
- if (target) {
- disk_name = make_block_name(disk);
- if (!disk_name) {
- err = -ENOMEM;
- goto err_out;
- }
-
- err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
- if (err)
- goto err_out_disk_name;
-
- err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
- if (err)
- goto err_out_dev_link;
- }
-
- err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
- "subsystem");
- if (err)
- goto err_out_disk_name_lnk;
-
- kfree(disk_name);
-
- return 0;
-
-err_out_disk_name_lnk:
- if (target) {
- sysfs_remove_link(&target->kobj, disk_name);
-err_out_dev_link:
- sysfs_remove_link(&disk->kobj, "device");
-err_out_disk_name:
- kfree(disk_name);
-err_out:
- put_device(target);
- }
- return err;
+	/* suppress uevent if the disk suppresses it */
+ if (!disk->dev.uevent_suppress)
+ kobject_uevent(&p->dev.kobj, KOBJ_ADD);
}
/* Not exported, helper to add_disk(). */
@@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk)
struct hd_struct *p;
int err;
- kobject_set_name(&disk->kobj, "%s", disk->disk_name);
- /* ewww... some of these buggers have / in name... */
- s = strchr(disk->kobj.k_name, '/');
+ disk->dev.parent = disk->driverfs_dev;
+ disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+
+ strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+ /* ewww... some of these buggers have / in the name... */
+ s = strchr(disk->dev.bus_id, '/');
if (s)
*s = '!';
- if ((err = kobject_add(&disk->kobj)))
+
+ /* delay uevents, until we scanned partition table */
+ disk->dev.uevent_suppress = 1;
+
+ if (device_add(&disk->dev))
return;
- err = disk_sysfs_symlinks(disk);
+#ifndef CONFIG_SYSFS_DEPRECATED
+ err = sysfs_create_link(block_depr, &disk->dev.kobj,
+ kobject_name(&disk->dev.kobj));
if (err) {
- kobject_del(&disk->kobj);
+ device_del(&disk->dev);
return;
}
- disk_sysfs_add_subdirs(disk);
+#endif
+ disk_sysfs_add_subdirs(disk);
/* No minors to use for partitions */
if (disk->minors == 1)
@@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk)
if (!bdev)
goto exit;
- /* scan partition table, but suppress uevents */
bdev->bd_invalidated = 1;
- disk->part_uevent_suppress = 1;
err = blkdev_get(bdev, FMODE_READ, 0);
- disk->part_uevent_suppress = 0;
if (err < 0)
goto exit;
blkdev_put(bdev);
exit:
- /* announce disk after possible partitions are already created */
- kobject_uevent(&disk->kobj, KOBJ_ADD);
+ /* announce disk after possible partitions are created */
+ disk->dev.uevent_suppress = 0;
+ kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
/* announce possible partitions */
for (i = 1; i < disk->minors; i++) {
p = disk->part[i-1];
if (!p || !p->nr_sects)
continue;
- kobject_uevent(&p->kobj, KOBJ_ADD);
+ kobject_uevent(&p->dev.kobj, KOBJ_ADD);
}
}
@@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk)
disk_stat_set_all(disk, 0);
disk->stamp = 0;
- kobject_uevent(&disk->kobj, KOBJ_REMOVE);
- kobject_unregister(disk->holder_dir);
- kobject_unregister(disk->slave_dir);
- if (disk->driverfs_dev) {
- char *disk_name = make_block_name(disk);
- sysfs_remove_link(&disk->kobj, "device");
- if (disk_name) {
- sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
- kfree(disk_name);
- }
- put_device(disk->driverfs_dev);
- disk->driverfs_dev = NULL;
- }
- sysfs_remove_link(&disk->kobj, "subsystem");
- kobject_del(&disk->kobj);
+ kobject_put(disk->holder_dir);
+ kobject_put(disk->slave_dir);
+ disk->driverfs_dev = NULL;
+#ifndef CONFIG_SYSFS_DEPRECATED
+ sysfs_remove_link(block_depr, disk->dev.bus_id);
+#endif
+ device_del(&disk->dev);
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65c62e1bfd6..eb97f2897e2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -169,7 +169,7 @@ static inline char *task_state(struct task_struct *p, char *buffer)
ppid = pid_alive(p) ?
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
tpid = pid_alive(p) && p->ptrace ?
- task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+ task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
buffer += sprintf(buffer,
"State:\t%s\n"
"Tgid:\t%d\n"
@@ -464,8 +464,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
}
sid = task_session_nr_ns(task, ns);
+ ppid = task_tgid_nr_ns(task->real_parent, ns);
pgid = task_pgrp_nr_ns(task, ns);
- ppid = task_ppid_nr_ns(task, ns);
unlock_task_sighand(task, &flags);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7411bfb0b7c..91fa8e6ce8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
}
#endif
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+ int i;
+ struct task_struct *task = m->private;
+ seq_puts(m, "Latency Top version : v0.1\n");
+
+ for (i = 0; i < 32; i++) {
+ if (task->latency_record[i].backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %li %li ",
+ task->latency_record[i].count,
+ task->latency_record[i].time,
+ task->latency_record[i].max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ char sym[KSYM_NAME_LEN];
+ char *c;
+ if (!task->latency_record[i].backtrace[q])
+ break;
+ if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+ break;
+ sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+ c = strchr(sym, '+');
+ if (c)
+ *c = 0;
+ seq_printf(m, "%s ", sym);
+ }
+ seq_printf(m, "\n");
+ }
+
+ }
+ return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct task_struct *task = get_proc_task(inode);
+
+ ret = single_open(file, lstats_show_proc, NULL);
+ if (!ret) {
+ m = file->private_data;
+ m->private = task;
+ }
+ return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ struct seq_file *m;
+ struct task_struct *task;
+
+ m = file->private_data;
+ task = m->private;
+ clear_all_latency_tracing(task);
+
+ return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
/* The badness from the OOM killer */
unsigned long badness(struct task_struct *p, unsigned long uptime);
static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
};
#endif
+
#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
@@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, lstats),
+#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
@@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, lstats),
+#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94cc722..c4d3d17923f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
{
struct inode *inode;
loff_t pos;
+ int retval = -EINVAL;
inode = file->f_path.dentry->d_inode;
if (unlikely((ssize_t) count < 0))
- goto Einval;
+ return retval;
pos = *ppos;
if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
- goto Einval;
+ return retval;
if (unlikely(inode->i_flock && mandatory_lock(inode))) {
- int retval = locks_mandatory_area(
+ retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
if (retval < 0)
return retval;
}
+ retval = security_file_permission(file,
+ read_write == READ ? MAY_READ : MAY_WRITE);
+ if (retval)
+ return retval;
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
-
-Einval:
- return -EINVAL;
}
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
@@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
count = ret;
- ret = security_file_permission (file, MAY_READ);
- if (!ret) {
- if (file->f_op->read)
- ret = file->f_op->read(file, buf, count, pos);
- else
- ret = do_sync_read(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_access(file->f_path.dentry);
- add_rchar(current, ret);
- }
- inc_syscr(current);
+ if (file->f_op->read)
+ ret = file->f_op->read(file, buf, count, pos);
+ else
+ ret = do_sync_read(file, buf, count, pos);
+ if (ret > 0) {
+ fsnotify_access(file->f_path.dentry);
+ add_rchar(current, ret);
}
+ inc_syscr(current);
}
return ret;
@@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
- ret = security_file_permission (file, MAY_WRITE);
- if (!ret) {
- if (file->f_op->write)
- ret = file->f_op->write(file, buf, count, pos);
- else
- ret = do_sync_write(file, buf, count, pos);
- if (ret > 0) {
- fsnotify_modify(file->f_path.dentry);
- add_wchar(current, ret);
- }
- inc_syscw(current);
+ if (file->f_op->write)
+ ret = file->f_op->write(file, buf, count, pos);
+ else
+ ret = do_sync_write(file, buf, count, pos);
+ if (ret > 0) {
+ fsnotify_modify(file->f_path.dentry);
+ add_wchar(current, ret);
}
+ inc_syscw(current);
}
return ret;
@@ -603,9 +599,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
- ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
- if (ret)
- goto out;
fnv = NULL;
if (type == READ) {
@@ -737,10 +730,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
goto fput_in;
count = retval;
- retval = security_file_permission (in_file, MAY_READ);
- if (retval)
- goto fput_in;
-
/*
* Get output file, and verify that it is ok..
*/
@@ -759,10 +748,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
goto fput_out;
count = retval;
- retval = security_file_permission (out_file, MAY_WRITE);
- if (retval)
- goto fput_out;
-
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
diff --git a/fs/splice.c b/fs/splice.c
index 6bdcb6107bc..56b802bfbfa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -908,10 +908,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
if (unlikely(ret < 0))
return ret;
- ret = security_file_permission(out, MAY_WRITE);
- if (unlikely(ret < 0))
- return ret;
-
return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
@@ -934,10 +930,6 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
- ret = security_file_permission(in, MAY_READ);
- if (unlikely(ret < 0))
- return ret;
-
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
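
The open-coded security_file_permission() calls removed from vfs_read(), vfs_write(), do_readv_writev(), do_sendfile() and the two splice helpers are all covered by the single call that rw_verify_area() now makes. A minimal sketch of the calling pattern the remaining callers rely on, mirroring the new vfs_read() body; example_read() is an illustrative name, not a function from this patch:

#include <linux/fs.h>

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *pos)
{
	/* one call handles -EINVAL, mandatory locking and the LSM check */
	ssize_t ret = rw_verify_area(READ, file, pos, count);

	if (ret < 0)
		return ret;
	count = ret;			/* possibly clamped to MAX_RW_COUNT */

	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else
		ret = do_sync_read(file, buf, count, pos);
	return ret;
}
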
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 337162935d2..4948d9bc405 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
/**
* sysfs_remove_one - remove sysfs_dirent from parent
* @acxt: addrm context to use
- * @sd: sysfs_dirent to be added
+ * @sd: sysfs_dirent to be removed
*
* Mark @sd removed and drop nlink of parent inode if @sd is a
* directory. @sd is unlinked from the children list.
@@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
/* no such entry */
- if (!sd)
+ if (!sd) {
+ ret = ERR_PTR(-ENOENT);
goto out_unlock;
+ }
/* attach dentry and inode */
inode = sysfs_get_inode(sd);
@@ -781,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
old_dentry = sysfs_get_dentry(sd);
if (IS_ERR(old_dentry)) {
error = PTR_ERR(old_dentry);
+ old_dentry = NULL;
goto out;
}
@@ -848,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
old_dentry = sysfs_get_dentry(sd);
if (IS_ERR(old_dentry)) {
error = PTR_ERR(old_dentry);
+ old_dentry = NULL;
goto out;
}
old_parent = old_dentry->d_parent;
@@ -855,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
new_parent = sysfs_get_dentry(new_parent_sd);
if (IS_ERR(new_parent)) {
error = PTR_ERR(new_parent);
+ new_parent = NULL;
goto out;
}
@@ -878,7 +883,6 @@ again:
error = 0;
d_add(new_dentry, NULL);
d_move(old_dentry, new_dentry);
- dput(new_dentry);
/* Remove from old parent's list and insert into new parent's list. */
sysfs_unlink_sibling(sd);
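
In the rename and move paths, old_dentry and new_parent are reset to NULL when sysfs_get_dentry() fails, so the shared out: label can keep releasing them unconditionally without ever handing an ERR_PTR value to dput(). A minimal sketch of that idiom with invented names (example_grab_two() and get() are not sysfs functions):

#include <linux/dcache.h>
#include <linux/err.h>

static int example_grab_two(struct dentry *(*get)(int id))
{
	struct dentry *a = NULL, *b = NULL;
	int error = 0;

	a = get(0);
	if (IS_ERR(a)) {
		error = PTR_ERR(a);
		a = NULL;		/* never pass an ERR_PTR to dput() */
		goto out;
	}
	b = get(1);
	if (IS_ERR(b)) {
		error = PTR_ERR(b);
		b = NULL;
		goto out;
	}
	/* ... use a and b ... */
out:
	dput(b);			/* dput(NULL) is a no-op */
	dput(a);
	return error;
}
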
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b834f1709f9..a271c87c447 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,43 +20,6 @@
#include "sysfs.h"
-#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
-
-/*
- * Subsystem file operations.
- * These operations allow subsystems to have files that can be
- * read/written.
- */
-static ssize_t
-subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
-{
- struct kset *kset = to_kset(kobj);
- struct subsys_attribute * sattr = to_sattr(attr);
- ssize_t ret = -EIO;
-
- if (sattr->show)
- ret = sattr->show(kset, page);
- return ret;
-}
-
-static ssize_t
-subsys_attr_store(struct kobject * kobj, struct attribute * attr,
- const char * page, size_t count)
-{
- struct kset *kset = to_kset(kobj);
- struct subsys_attribute * sattr = to_sattr(attr);
- ssize_t ret = -EIO;
-
- if (sattr->store)
- ret = sattr->store(kset, page, count);
- return ret;
-}
-
-static struct sysfs_ops subsys_sysfs_ops = {
- .show = subsys_attr_show,
- .store = subsys_attr_store,
-};
-
/*
* There's one sysfs_buffer for each open file and one
* sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -66,7 +29,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
* sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open
* is protected by sysfs_open_dirent_lock.
*/
-static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
struct sysfs_open_dirent {
atomic_t refcnt;
@@ -354,31 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
{
struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- struct sysfs_buffer * buffer;
- struct sysfs_ops * ops = NULL;
- int error;
+ struct sysfs_buffer *buffer;
+ struct sysfs_ops *ops;
+ int error = -EACCES;
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active_two(attr_sd))
return -ENODEV;
- /* if the kobject has no ktype, then we assume that it is a subsystem
- * itself, and use ops for it.
- */
- if (kobj->kset && kobj->kset->ktype)
- ops = kobj->kset->ktype->sysfs_ops;
- else if (kobj->ktype)
+ /* every kobject with an attribute needs a ktype assigned */
+ if (kobj->ktype && kobj->ktype->sysfs_ops)
ops = kobj->ktype->sysfs_ops;
- else
- ops = &subsys_sysfs_ops;
-
- error = -EACCES;
-
- /* No sysfs operations, either from having no subsystem,
- * or the subsystem have no operations.
- */
- if (!ops)
+ else {
+ printk(KERN_ERR "missing sysfs attribute operations for "
+ "kobject: %s\n", kobject_name(kobj));
+ WARN_ON(1);
goto err_out;
+ }
/* File needs write support.
* The inode's perms must say it's ok,
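
With subsys_sysfs_ops gone, a kobject that exposes attributes must get its sysfs_ops from its ktype, otherwise sysfs_open_file() now fails with -EACCES and warns. A minimal sketch of the contract a ktype has to satisfy; the example_* names and the payload are placeholders, not code from this patch:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t example_attr_show(struct kobject *kobj,
				 struct attribute *attr, char *buf)
{
	return sprintf(buf, "hello\n");		/* placeholder payload */
}

static ssize_t example_attr_store(struct kobject *kobj,
				  struct attribute *attr,
				  const char *buf, size_t count)
{
	return count;				/* accept and discard */
}

static struct sysfs_ops example_sysfs_ops = {
	.show	= example_attr_show,
	.store	= example_attr_store,
};

static struct kobj_type example_ktype = {
	.sysfs_ops = &example_sysfs_ops,
	/* .release and .default_attrs omitted from this sketch */
};
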
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3eac20c63c4..5f66c446615 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,39 +19,6 @@
#include "sysfs.h"
-static int object_depth(struct sysfs_dirent *sd)
-{
- int depth = 0;
-
- for (; sd->s_parent; sd = sd->s_parent)
- depth++;
-
- return depth;
-}
-
-static int object_path_length(struct sysfs_dirent * sd)
-{
- int length = 1;
-
- for (; sd->s_parent; sd = sd->s_parent)
- length += strlen(sd->s_name) + 1;
-
- return length;
-}
-
-static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
-{
- --length;
- for (; sd->s_parent; sd = sd->s_parent) {
- int cur = strlen(sd->s_name);
-
- /* back up enough to print this bus id with '/' */
- length -= cur;
- strncpy(buffer + length, sd->s_name, cur);
- *(buffer + --length) = '/';
- }
-}
-
/**
* sysfs_create_link - create symlink between two objects.
* @kobj: object whose directory we're creating the link in.
@@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
return error;
}
-
/**
* sysfs_remove_link - remove symlink in object's directory.
* @kobj: object we're acting for.
@@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
sysfs_hash_and_remove(kobj->sd, name);
}
-static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
- struct sysfs_dirent * target_sd, char *path)
+static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
+ struct sysfs_dirent *target_sd, char *path)
{
- char * s;
- int depth, size;
+ struct sysfs_dirent *base, *sd;
+ char *s = path;
+ int len = 0;
+
+ /* go up to the root, stop at the base */
+ base = parent_sd;
+ while (base->s_parent) {
+ sd = target_sd->s_parent;
+ while (sd->s_parent && base != sd)
+ sd = sd->s_parent;
+
+ if (base == sd)
+ break;
+
+ strcpy(s, "../");
+ s += 3;
+ base = base->s_parent;
+ }
+
+ /* determine end of target string for reverse fillup */
+ sd = target_sd;
+ while (sd->s_parent && sd != base) {
+ len += strlen(sd->s_name) + 1;
+ sd = sd->s_parent;
+ }
- depth = object_depth(parent_sd);
- size = object_path_length(target_sd) + depth * 3 - 1;
- if (size > PATH_MAX)
+ /* check limits */
+ if (len < 2)
+ return -EINVAL;
+ len--;
+ if ((s - path) + len > PATH_MAX)
return -ENAMETOOLONG;
- pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+ /* reverse fillup of target string from target to base */
+ sd = target_sd;
+ while (sd->s_parent && sd != base) {
+ int slen = strlen(sd->s_name);
- for (s = path; depth--; s += 3)
- strcpy(s,"../");
+ len -= slen;
+ strncpy(s + len, sd->s_name, slen);
+ if (len)
+ s[--len] = '/';
- fill_object_path(target_sd, path, size);
- pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+ sd = sd->s_parent;
+ }
return 0;
}
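
The rewritten sysfs_get_target_path() builds the symlink target as a relative path: climb from the link's parent emitting "../" until an ancestor of the target is reached, then append the remaining target components in reverse. The same idea replayed on a toy userspace tree; the node type and the names used below are invented for illustration, not sysfs code:

#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;
};

static int is_ancestor(const struct node *a, const struct node *n)
{
	for (; n; n = n->parent)
		if (n == a)
			return 1;
	return 0;
}

static void relative_path(struct node *parent, struct node *target,
			  char *buf, size_t size)
{
	struct node *base = parent, *sd;
	char tail[256] = "";

	/* climb from the link's parent until we sit on an ancestor of target */
	buf[0] = '\0';
	while (base && !is_ancestor(base, target)) {
		strncat(buf, "../", size - strlen(buf) - 1);
		base = base->parent;
	}

	/* prepend the target-side components one by one (reverse fill) */
	for (sd = target; sd && sd != base; sd = sd->parent) {
		char tmp[256];

		snprintf(tmp, sizeof(tmp), "%s%s%s",
			 sd->name, tail[0] ? "/" : "", tail);
		strcpy(tail, tmp);
	}
	strncat(buf, tail, size - strlen(buf) - 1);
}

int main(void)
{
	struct node root = { "", NULL };
	struct node bus = { "bus", &root }, pci = { "pci", &bus };
	struct node devices = { "devices", &root };
	struct node dev = { "0000:00:1f.0", &devices };
	char buf[512];

	/* link created in /bus/pci pointing at /devices/0000:00:1f.0 */
	relative_path(&pci, &dev, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints ../../devices/0000:00:1f.0 */
	return 0;
}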