aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig9
-rw-r--r--fs/Makefile5
-rw-r--r--fs/adfs/super.c16
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/afs/Kconfig8
-rw-r--r--fs/afs/Makefile3
-rw-r--r--fs/afs/cache.c503
-rw-r--r--fs/afs/cache.h15
-rw-r--r--fs/afs/cell.c16
-rw-r--r--fs/afs/file.c220
-rw-r--r--fs/afs/inode.c31
-rw-r--r--fs/afs/internal.h53
-rw-r--r--fs/afs/main.c27
-rw-r--r--fs/afs/mntpt.c4
-rw-r--r--fs/afs/vlocation.c25
-rw-r--r--fs/afs/volume.c14
-rw-r--r--fs/afs/write.c21
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c29
-rw-r--r--fs/autofs4/expire.c27
-rw-r--r--fs/autofs4/root.c41
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/binfmt_elf_fdpic.c25
-rw-r--r--fs/binfmt_som.c7
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c2
-rw-r--r--fs/btrfs/btrfs_inode.h31
-rw-r--r--fs/btrfs/ctree.c588
-rw-r--r--fs/btrfs/ctree.h71
-rw-r--r--fs/btrfs/delayed-ref.c669
-rw-r--r--fs/btrfs/delayed-ref.h193
-rw-r--r--fs/btrfs/dir-item.c3
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c1674
-rw-r--r--fs/btrfs/extent_io.c51
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c50
-rw-r--r--fs/btrfs/inode-item.c3
-rw-r--r--fs/btrfs/inode.c206
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/locking.c21
-rw-r--r--fs/btrfs/ordered-data.c118
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/transaction.c151
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c444
-rw-r--r--fs/btrfs/tree-log.h17
-rw-r--r--fs/buffer.c63
-rw-r--r--fs/cachefiles/Kconfig39
-rw-r--r--fs/cachefiles/Makefile18
-rw-r--r--fs/cachefiles/bind.c286
-rw-r--r--fs/cachefiles/daemon.c755
-rw-r--r--fs/cachefiles/interface.c449
-rw-r--r--fs/cachefiles/internal.h360
-rw-r--r--fs/cachefiles/key.c159
-rw-r--r--fs/cachefiles/main.c106
-rw-r--r--fs/cachefiles/namei.c771
-rw-r--r--fs/cachefiles/proc.c134
-rw-r--r--fs/cachefiles/rdwr.c879
-rw-r--r--fs/cachefiles/security.c116
-rw-r--r--fs/cachefiles/xattr.c291
-rw-r--r--fs/cifs/dir.c4
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/compat.c105
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/cramfs/inode.c39
-rw-r--r--fs/cramfs/uncompress.c2
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/keystore.c3
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efs/super.c20
-rw-r--r--fs/eventfd.c26
-rw-r--r--fs/eventpoll.c614
-rw-r--r--fs/exec.c35
-rw-r--r--fs/exofs/BUGS3
-rw-r--r--fs/exofs/Kbuild16
-rw-r--r--fs/exofs/Kconfig13
-rw-r--r--fs/exofs/common.h184
-rw-r--r--fs/exofs/dir.c672
-rw-r--r--fs/exofs/exofs.h180
-rw-r--r--fs/exofs/file.c87
-rw-r--r--fs/exofs/inode.c1303
-rw-r--r--fs/exofs/namei.c342
-rw-r--r--fs/exofs/osd.c153
-rw-r--r--fs/exofs/super.c584
-rw-r--r--fs/exofs/symlink.c57
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/file.c2
-rw-r--r--fs/ext3/inode.c139
-rw-r--r--fs/ext3/ioctl.c59
-rw-r--r--fs/ext3/namei.c29
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h93
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h6
-rw-r--r--fs/ext4/ext4_sb.h14
-rw-r--r--fs/ext4/extents.c127
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/ialloc.c273
-rw-r--r--fs/ext4/inode.c429
-rw-r--r--fs/ext4/ioctl.c17
-rw-r--r--fs/ext4/mballoc.c158
-rw-r--r--fs/ext4/mballoc.h8
-rw-r--r--fs/ext4/namei.c164
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c327
-rw-r--r--fs/fat/inode.c8
-rw-r--r--fs/fs-writeback.c29
-rw-r--r--fs/fs_struct.c177
-rw-r--r--fs/fscache/Kconfig56
-rw-r--r--fs/fscache/Makefile19
-rw-r--r--fs/fscache/cache.c415
-rw-r--r--fs/fscache/cookie.c500
-rw-r--r--fs/fscache/fsdef.c144
-rw-r--r--fs/fscache/histogram.c109
-rw-r--r--fs/fscache/internal.h380
-rw-r--r--fs/fscache/main.c124
-rw-r--r--fs/fscache/netfs.c103
-rw-r--r--fs/fscache/object.c810
-rw-r--r--fs/fscache/operation.c459
-rw-r--r--fs/fscache/page.c816
-rw-r--r--fs/fscache/proc.c68
-rw-r--r--fs/fscache/stats.c212
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/ops_file.c5
-rw-r--r--fs/hfs/super.c3
-rw-r--r--fs/hfsplus/options.c2
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hpfs/super.c5
-rw-r--r--fs/hppfs/hppfs.c7
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/internal.h8
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/jbd/journal.c34
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jbd2/revoke.c24
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jfs/acl.c2
-rw-r--r--fs/lockd/clntlock.c51
-rw-r--r--fs/lockd/mon.c8
-rw-r--r--fs/lockd/svc.c42
-rw-r--r--fs/minix/inode.c11
-rw-r--r--fs/mpage.c13
-rw-r--r--fs/namei.c14
-rw-r--r--fs/namespace.c61
-rw-r--r--fs/nfs/Kconfig8
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/callback.c31
-rw-r--r--fs/nfs/callback.h1
-rw-r--r--fs/nfs/client.c130
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/file.c75
-rw-r--r--fs/nfs/fscache-index.c337
-rw-r--r--fs/nfs/fscache.c523
-rw-r--r--fs/nfs/fscache.h220
-rw-r--r--fs/nfs/getroot.c4
-rw-r--r--fs/nfs/inode.c323
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/iostat.h18
-rw-r--r--fs/nfs/nfs2xdr.c9
-rw-r--r--fs/nfs/nfs3proc.c7
-rw-r--r--fs/nfs/nfs3xdr.c37
-rw-r--r--fs/nfs/nfs4proc.c49
-rw-r--r--fs/nfs/nfs4state.c10
-rw-r--r--fs/nfs/nfs4xdr.c213
-rw-r--r--fs/nfs/pagelist.c11
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c27
-rw-r--r--fs/nfs/super.c49
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nfsd/nfsctl.c6
-rw-r--r--fs/nfsd/nfssvc.c12
-rw-r--r--fs/ntfs/dir.c4
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ntfs/layout.h329
-rw-r--r--fs/ntfs/logfile.h6
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ntfs/super.c50
-rw-r--r--fs/ntfs/usnjrnl.h48
-rw-r--r--fs/ocfs2/acl.c2
-rw-r--r--fs/ocfs2/mmap.c6
-rw-r--r--fs/omfs/inode.c7
-rw-r--r--fs/open.c1
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/proc_tty.c12
-rw-r--r--fs/proc/task_nommu.c7
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/ramfs/file-nommu.c15
-rw-r--r--fs/ramfs/inode.c94
-rw-r--r--fs/read_write.c50
-rw-r--r--fs/reiserfs/Kconfig1
-rw-r--r--fs/reiserfs/super.c5
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/squashfs/super.c3
-rw-r--r--fs/super.c1
-rw-r--r--fs/sysfs/bin.c8
-rw-r--r--fs/sysv/inode.c3
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/udf/balloc.c150
-rw-r--r--fs/udf/dir.c14
-rw-r--r--fs/udf/directory.c38
-rw-r--r--fs/udf/ecma_167.h416
-rw-r--r--fs/udf/ialloc.c9
-rw-r--r--fs/udf/inode.c213
-rw-r--r--fs/udf/misc.c29
-rw-r--r--fs/udf/namei.c86
-rw-r--r--fs/udf/osta_udf.h22
-rw-r--r--fs/udf/partition.c2
-rw-r--r--fs/udf/super.c605
-rw-r--r--fs/udf/truncate.c44
-rw-r--r--fs/udf/udf_i.h6
-rw-r--r--fs/udf/udf_sb.h9
-rw-r--r--fs/udf/udfdecl.h57
-rw-r--r--fs/udf/udfend.h28
-rw-r--r--fs/udf/udftime.c6
-rw-r--r--fs/udf/unicode.c62
-rw-r--r--fs/ufs/super.c3
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/mutex.h25
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c107
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c37
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h13
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c157
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c137
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h32
-rw-r--r--fs/xfs/quota/xfs_dquot.c28
-rw-r--r--fs/xfs/quota/xfs_dquot.h18
-rw-r--r--fs/xfs/quota/xfs_qm.c212
-rw-r--r--fs/xfs/quota/xfs_qm.h26
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c1
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c190
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h40
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c16
-rw-r--r--fs/xfs/support/debug.c1
-rw-r--r--fs/xfs/support/uuid.c71
-rw-r--r--fs/xfs/support/uuid.h4
-rw-r--r--fs/xfs/xfs_ag.h4
-rw-r--r--fs/xfs/xfs_alloc.c26
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_attr_leaf.c58
-rw-r--r--fs/xfs/xfs_bmap.c76
-rw-r--r--fs/xfs/xfs_bmap.h6
-rw-r--r--fs/xfs/xfs_btree.c4
-rw-r--r--fs/xfs/xfs_btree.h2
-rw-r--r--fs/xfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/xfs_da_btree.h9
-rw-r--r--fs/xfs/xfs_dfrag.c68
-rw-r--r--fs/xfs/xfs_dinode.h4
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_dir2_block.c7
-rw-r--r--fs/xfs/xfs_dir2_data.h2
-rw-r--r--fs/xfs/xfs_dir2_leaf.c17
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_sf.c13
-rw-r--r--fs/xfs/xfs_extfree_item.h6
-rw-r--r--fs/xfs/xfs_filestream.c9
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c12
-rw-r--r--fs/xfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/xfs_ialloc_btree.h22
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_itable.c9
-rw-r--r--fs/xfs/xfs_log.c67
-rw-r--r--fs/xfs/xfs_log.h3
-rw-r--r--fs/xfs/xfs_log_priv.h3
-rw-r--r--fs/xfs/xfs_log_recover.c308
-rw-r--r--fs/xfs/xfs_mount.c253
-rw-r--r--fs/xfs/xfs_mount.h19
-rw-r--r--fs/xfs/xfs_qmops.c1
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c10
-rw-r--r--fs/xfs/xfs_rtalloc.h8
-rw-r--r--fs/xfs/xfs_trans.h24
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_trans_item.c2
-rw-r--r--fs/xfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/xfs_types.h8
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_vnodeops.c408
-rw-r--r--fs/xfs/xfs_vnodeops.h3
302 files changed, 21924 insertions, 6428 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa..86b203fc3c5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -66,6 +66,13 @@ config GENERIC_ACL
bool
select FS_POSIX_ACL
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
if BLOCK
menu "CD-ROM/DVD Filesystems"
@@ -169,6 +176,8 @@ source "fs/romfs/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
endif # MISC_FILESYSTEMS
menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd..70b2aed8713 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o
+ stack.o fs_struct.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
obj-$(CONFIG_EXT2_FS) += ext2/
@@ -116,7 +117,9 @@ obj-$(CONFIG_AFS_FS) += afs/
obj-$(CONFIG_BEFS_FS) += befs/
obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
+obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7f83a46f2b7..dd9becca424 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -219,16 +219,20 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
+ struct super_block *sb = dentry->d_sb;
+ struct adfs_sb_info *sbi = ADFS_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = ADFS_SUPER_MAGIC;
- buf->f_namelen = asb->s_namelen;
- buf->f_bsize = dentry->d_sb->s_blocksize;
- buf->f_blocks = asb->s_size;
- buf->f_files = asb->s_ids_per_zone * asb->s_map_size;
+ buf->f_namelen = sbi->s_namelen;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = sbi->s_size;
+ buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size;
buf->f_bavail =
- buf->f_bfree = adfs_map_free(dentry->d_sb);
+ buf->f_bfree = adfs_map_free(sb);
buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
return 0;
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a19d64b582a..5ce695e707f 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -533,6 +533,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
int free;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
AFFS_SB(sb)->s_reserved);
@@ -543,6 +544,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved;
buf->f_bfree = free;
buf->f_bavail = free;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_namelen = 30;
return 0;
}
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index e7b522fe15e..5c4e61d3c77 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -19,3 +19,11 @@ config AFS_DEBUG
See <file:Documentation/filesystems/afs.txt> for more information.
If unsure, say N.
+
+config AFS_FSCACHE
+ bool "Provide AFS client caching support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+ help
+ Say Y here if you want AFS data to be cached locally on disk through
+ the generic filesystem cache manager
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index a66671082cf..4f64b95d57b 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,7 +2,10 @@
# Makefile for Red Hat Linux AFS client.
#
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
kafs-objs := \
+ $(afs-cache-y) \
callback.o \
cell.o \
cmservice.o \
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index de0d7de69ed..e2b1d3f1651 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -1,6 +1,6 @@
/* AFS caching stuff
*
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -9,248 +9,395 @@
* 2 of the License, or (at your option) any later version.
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
- const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
- .name = "cell_ix",
- .data_size = sizeof(struct afs_cache_cell),
- .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
- .match = afs_cell_cache_match,
- .update = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+ void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+ uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+ .name = "afs",
+ .version = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+ .name = "AFS.cell",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = afs_cell_cache_get_key,
+ .get_aux = afs_cell_cache_get_aux,
+ .check_aux = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+ .name = "AFS.vldb",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = afs_vlocation_cache_get_key,
+ .get_aux = afs_vlocation_cache_get_aux,
+ .check_aux = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+ .name = "AFS.volume",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+ .name = "AFS.vnode",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = afs_vnode_cache_get_key,
+ .get_attr = afs_vnode_cache_get_attr,
+ .get_aux = afs_vnode_cache_get_aux,
+ .check_aux = afs_vnode_cache_check_aux,
+ .now_uncached = afs_vnode_cache_now_uncached,
};
-#endif
/*
- * match a cell record obtained from the cache
+ * set the key for the index entry
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
- const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
{
- const struct afs_cache_cell *ccell = entry;
- struct afs_cell *cell = target;
+ const struct afs_cell *cell = cookie_netfs_data;
+ uint16_t klen;
- _enter("{%s},{%s}", ccell->name, cell->name);
+ _enter("%p,%p,%u", cell, buffer, bufmax);
- if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
- }
+ klen = strlen(cell->name);
+ if (klen > bufmax)
+ return 0;
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
+ memcpy(buffer, cell->name, klen);
+ return klen;
}
-#endif
/*
- * update a cell record in the cache
+ * provide new auxilliary cache data
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
{
- struct afs_cache_cell *ccell = entry;
- struct afs_cell *cell = source;
+ const struct afs_cell *cell = cookie_netfs_data;
+ uint16_t dlen;
- _enter("%p,%p", source, entry);
+ _enter("%p,%p,%u", cell, buffer, bufmax);
- strncpy(ccell->name, cell->name, sizeof(ccell->name));
+ dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+ dlen = min(dlen, bufmax);
+ dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
- memcpy(ccell->vl_servers,
- cell->vl_addrs,
- min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+ memcpy(buffer, cell->vl_addrs, dlen);
+ return dlen;
+}
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen)
+{
+ _leave(" = OKAY");
+ return FSCACHE_CHECKAUX_OKAY;
}
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
- const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
- .name = "vldb",
- .data_size = sizeof(struct afs_cache_vlocation),
- .keys[0] = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
- .match = afs_vlocation_cache_match,
- .update = afs_vlocation_cache_update,
-};
-#endif
+/*****************************************************************************/
/*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
- const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
{
- const struct afs_cache_vlocation *vldb = entry;
- struct afs_vlocation *vlocation = target;
+ const struct afs_vlocation *vlocation = cookie_netfs_data;
+ uint16_t klen;
+
+ _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+ klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+ if (klen > bufmax)
+ return 0;
- _enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+ memcpy(buffer, vlocation->vldb.name, klen);
- if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
- ) {
- if (!vlocation->valid ||
- vlocation->vldb.rtime == vldb->rtime
+ _leave(" = %u", klen);
+ return klen;
+}
+
+/*
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct afs_vlocation *vlocation = cookie_netfs_data;
+ uint16_t dlen;
+
+ _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+ dlen = sizeof(struct afs_cache_vlocation);
+ dlen -= offsetof(struct afs_cache_vlocation, nservers);
+ if (dlen > bufmax)
+ return 0;
+
+ memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+ _leave(" = %u", dlen);
+ return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen)
+{
+ const struct afs_cache_vlocation *cvldb;
+ struct afs_vlocation *vlocation = cookie_netfs_data;
+ uint16_t dlen;
+
+ _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+ /* check the size of the data is what we're expecting */
+ dlen = sizeof(struct afs_cache_vlocation);
+ dlen -= offsetof(struct afs_cache_vlocation, nservers);
+ if (dlen != buflen)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+ /* if what's on disk is more valid than what's in memory, then use the
+ * VL record from the cache */
+ if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+ memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+ vlocation->valid = 1;
+ _leave(" = SUCCESS [c->m]");
+ return FSCACHE_CHECKAUX_OKAY;
+ }
+
+ /* need to update the cache if the cached info differs */
+ if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+ /* delete if the volume IDs for this name differ */
+ if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+ sizeof(cvldb->vid)) != 0
) {
- vlocation->vldb = *vldb;
- vlocation->valid = 1;
- _leave(" = SUCCESS [c->m]");
- return CACHEFS_MATCH_SUCCESS;
- } else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
- /* delete if VIDs for this name differ */
- if (memcmp(&vlocation->vldb.vid,
- &vldb->vid,
- sizeof(vldb->vid)) != 0) {
- _leave(" = DELETE");
- return CACHEFS_MATCH_SUCCESS_DELETE;
- }
-
- _leave(" = UPDATE");
- return CACHEFS_MATCH_SUCCESS_UPDATE;
- } else {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
+ _leave(" = OBSOLETE");
+ return FSCACHE_CHECKAUX_OBSOLETE;
}
+
+ _leave(" = UPDATE");
+ return FSCACHE_CHECKAUX_NEEDS_UPDATE;
}
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
+ _leave(" = OKAY");
+ return FSCACHE_CHECKAUX_OKAY;
}
-#endif
+/*****************************************************************************/
/*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
{
- struct afs_cache_vlocation *vldb = entry;
- struct afs_vlocation *vlocation = source;
+ const struct afs_volume *volume = cookie_netfs_data;
+ uint16_t klen;
+
+ _enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+ klen = sizeof(volume->type);
+ if (klen > bufmax)
+ return 0;
- _enter("");
+ memcpy(buffer, &volume->type, sizeof(volume->type));
+
+ _leave(" = %u", klen);
+ return klen;
- *vldb = vlocation->vldb;
}
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
- const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
- .name = "volume",
- .data_size = sizeof(struct afs_cache_vhash),
- .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 1 },
- .keys[1] = { CACHEFS_INDEX_KEYS_BIN, 1 },
- .match = afs_volume_cache_match,
- .update = afs_volume_cache_update,
-};
-#endif
+/*****************************************************************************/
/*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
- const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
{
- const struct afs_cache_vhash *vhash = entry;
- struct afs_volume *volume = target;
+ const struct afs_vnode *vnode = cookie_netfs_data;
+ uint16_t klen;
- _enter("{%u},{%u}", volume->type, vhash->vtype);
+ _enter("{%x,%x,%llx},%p,%u",
+ vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+ buffer, bufmax);
- if (volume->type == vhash->vtype) {
- _leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
- }
+ klen = sizeof(vnode->fid.vnode);
+ if (klen > bufmax)
+ return 0;
+
+ memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
+ _leave(" = %u", klen);
+ return klen;
}
-#endif
/*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
{
- struct afs_cache_vhash *vhash = entry;
- struct afs_volume *volume = source;
+ const struct afs_vnode *vnode = cookie_netfs_data;
- _enter("");
+ _enter("{%x,%x,%llx},",
+ vnode->fid.vnode, vnode->fid.unique,
+ vnode->status.data_version);
- vhash->vtype = volume->type;
+ *size = vnode->status.size;
}
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
- const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
- .name = "vnode",
- .data_size = sizeof(struct afs_cache_vnode),
- .keys[0] = { CACHEFS_INDEX_KEYS_BIN, 4 },
- .match = afs_vnode_cache_match,
- .update = afs_vnode_cache_update,
-};
-#endif
/*
- * match a vnode record stored in the cache
+ * provide new auxilliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct afs_vnode *vnode = cookie_netfs_data;
+ uint16_t dlen;
+
+ _enter("{%x,%x,%Lx},%p,%u",
+ vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+ buffer, bufmax);
+
+ dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+ if (dlen > bufmax)
+ return 0;
+
+ memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+ buffer += sizeof(vnode->fid.unique);
+ memcpy(buffer, &vnode->status.data_version,
+ sizeof(vnode->status.data_version));
+
+ _leave(" = %u", dlen);
+ return dlen;
+}
+
+/*
+ * check that the auxilliary data indicates that the entry is still valid
*/
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
- const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+ const void *buffer,
+ uint16_t buflen)
{
- const struct afs_cache_vnode *cvnode = entry;
- struct afs_vnode *vnode = target;
-
- _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
- vnode->fid.vnode,
- vnode->fid.unique,
- vnode->status.version,
- cvnode->vnode_id,
- cvnode->vnode_unique,
- cvnode->data_version);
-
- if (vnode->fid.vnode != cvnode->vnode_id) {
- _leave(" = FAILED");
- return CACHEFS_MATCH_FAILED;
+ struct afs_vnode *vnode = cookie_netfs_data;
+ uint16_t dlen;
+
+ _enter("{%x,%x,%llx},%p,%u",
+ vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+ buffer, buflen);
+
+ /* check the size of the data is what we're expecting */
+ dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+ if (dlen != buflen) {
+ _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+ return FSCACHE_CHECKAUX_OBSOLETE;
}
- if (vnode->fid.unique != cvnode->vnode_unique ||
- vnode->status.version != cvnode->data_version) {
- _leave(" = DELETE");
- return CACHEFS_MATCH_SUCCESS_DELETE;
+ if (memcmp(buffer,
+ &vnode->fid.unique,
+ sizeof(vnode->fid.unique)
+ ) != 0) {
+ unsigned unique;
+
+ memcpy(&unique, buffer, sizeof(unique));
+
+ _leave(" = OBSOLETE [uniq %x != %x]",
+ unique, vnode->fid.unique);
+ return FSCACHE_CHECKAUX_OBSOLETE;
+ }
+
+ if (memcmp(buffer + sizeof(vnode->fid.unique),
+ &vnode->status.data_version,
+ sizeof(vnode->status.data_version)
+ ) != 0) {
+ afs_dataversion_t version;
+
+ memcpy(&version, buffer + sizeof(vnode->fid.unique),
+ sizeof(version));
+
+ _leave(" = OBSOLETE [vers %llx != %llx]",
+ version, vnode->status.data_version);
+ return FSCACHE_CHECKAUX_OBSOLETE;
}
_leave(" = SUCCESS");
- return CACHEFS_MATCH_SUCCESS;
+ return FSCACHE_CHECKAUX_OKAY;
}
-#endif
/*
- * update a vnode record stored in the cache
+ * indication the cookie is no longer uncached
+ * - this function is called when the backing store currently caching a cookie
+ * is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
{
- struct afs_cache_vnode *cvnode = entry;
- struct afs_vnode *vnode = source;
+ struct afs_vnode *vnode = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ _enter("{%x,%x,%Lx}",
+ vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ for (;;) {
+ /* grab a bunch of pages to clean */
+ nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+ first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
- _enter("");
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
- cvnode->vnode_id = vnode->fid.vnode;
- cvnode->vnode_unique = vnode->fid.unique;
- cvnode->data_version = vnode->status.version;
+ _leave("");
}
-#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 36a3642cf90..5c4f6b499e9 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,6 +1,6 @@
/* AFS local cache management interface
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
* 2 of the License, or (at your option) any later version.
*/
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 5e1df14e16b..e19c13f059e 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
if (ret < 0)
goto error;
-#ifdef AFS_CACHING_SUPPORT
- /* put it up for caching */
- cachefs_acquire_cookie(afs_cache_netfs.primary_index,
- &afs_vlocation_cache_index_def,
- cell,
- &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+ /* put it up for caching (this never returns an error) */
+ cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+ &afs_cell_cache_index_def,
+ cell);
#endif
/* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
list_del_init(&cell->proc_link);
up_write(&afs_proc_cells_sem);
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_relinquish_cookie(cell->cache, 0);
#endif
-
key_put(cell->anonymous_key);
kfree(cell);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a3901769a96..7a1d942ef68 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+
const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
const struct address_space_operations afs_fs_aops = {
.readpage = afs_readpage,
+ .readpages = afs_readpages,
.set_page_dirty = afs_set_page_dirty,
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
/*
* deal with notification that a page was read from the cache
*/
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
- struct page *page,
- void *data,
- int error)
+static void afs_file_readpage_read_complete(struct page *page,
+ void *data,
+ int error)
{
- _enter("%p,%p,%p,%d", cookie_data, page, data, error);
+ _enter("%p,%p,%d", page, data, error);
- if (error)
- SetPageError(page);
- else
+ /* if the read completes with an error, we just unlock the page and let
+ * the VM reissue the readpage */
+ if (!error)
SetPageUptodate(page);
unlock_page(page);
-
}
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
- struct page *page,
- void *data,
- int error)
-{
- _enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
- unlock_page(page);
-}
-#endif
/*
* AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto error;
-#ifdef AFS_CACHING_SUPPORT
/* is it cached? */
- ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+ ret = fscache_read_or_alloc_page(vnode->cache,
page,
afs_file_readpage_read_complete,
NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
#else
ret = -ENOBUFS;
#endif
-
switch (ret) {
- /* read BIO submitted and wb-journal entry found */
- case 1:
- BUG(); // TODO - handle wb-journal match
-
/* read BIO submitted (page in cache) */
case 0:
break;
- /* no page available in cache */
- case -ENOBUFS:
+ /* page not yet cached */
case -ENODATA:
+ _debug("cache said ENODATA");
+ goto go_on;
+
+ /* page will not be cached */
+ case -ENOBUFS:
+ _debug("cache said ENOBUFS");
default:
+ go_on:
offset = page->index << PAGE_CACHE_SHIFT;
len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
set_bit(AFS_VNODE_DELETED, &vnode->flags);
ret = -ESTALE;
}
-#ifdef AFS_CACHING_SUPPORT
- cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_uncache_page(vnode->cache, page);
#endif
+ BUG_ON(PageFsCache(page));
goto error;
}
SetPageUptodate(page);
-#ifdef AFS_CACHING_SUPPORT
- if (cachefs_write_page(vnode->cache,
- page,
- afs_file_readpage_write_complete,
- NULL,
- GFP_KERNEL) != 0
- ) {
- cachefs_uncache_page(vnode->cache, page);
- unlock_page(page);
+ /* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+ if (PageFsCache(page) &&
+ fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+ fscache_uncache_page(vnode->cache, page);
+ BUG_ON(PageFsCache(page));
}
-#else
- unlock_page(page);
#endif
+ unlock_page(page);
}
_leave(" = 0");
@@ -232,34 +216,59 @@ error:
}
/*
- * invalidate part or all of a page
+ * read a set of pages
*/
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
{
- int ret = 1;
+ struct afs_vnode *vnode;
+ int ret = 0;
- _enter("{%lu},%lu", page->index, offset);
+ _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
- BUG_ON(!PageLocked(page));
+ vnode = AFS_FS_I(mapping->host);
+ if (vnode->flags & AFS_VNODE_DELETED) {
+ _leave(" = -ESTALE");
+ return -ESTALE;
+ }
- if (PagePrivate(page)) {
- /* We release buffers only if the entire page is being
- * invalidated.
- * The get_block cached value has been unconditionally
- * invalidated, so real IO is not possible anymore.
- */
- if (offset == 0) {
- BUG_ON(!PageLocked(page));
-
- ret = 0;
- if (!PageWriteback(page))
- ret = page->mapping->a_ops->releasepage(page,
- 0);
- /* possibly should BUG_ON(!ret); - neilb */
- }
+ /* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+ ret = fscache_read_or_alloc_pages(vnode->cache,
+ mapping,
+ pages,
+ &nr_pages,
+ afs_file_readpage_read_complete,
+ NULL,
+ mapping_gfp_mask(mapping));
+#else
+ ret = -ENOBUFS;
+#endif
+
+ switch (ret) {
+ /* all pages are being read from the cache */
+ case 0:
+ BUG_ON(!list_empty(pages));
+ BUG_ON(nr_pages != 0);
+ _leave(" = 0 [reading all]");
+ return 0;
+
+ /* there were pages that couldn't be read from the cache */
+ case -ENODATA:
+ case -ENOBUFS:
+ break;
+
+ /* other error */
+ default:
+ _leave(" = %d", ret);
+ return ret;
}
- _leave(" = %d", ret);
+ /* load the missing pages from the network */
+ ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+ _leave(" = %d [netting]", ret);
+ return ret;
}
/*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
}
/*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ * the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+ _enter("{%lu},%lu", page->index, offset);
+
+ BUG_ON(!PageLocked(page));
+
+ /* we clean up only if the entire page is being invalidated */
+ if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+ if (PageFsCache(page)) {
+ struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ fscache_wait_on_page_write(vnode->cache, page);
+ fscache_uncache_page(vnode->cache, page);
+ ClearPageFsCache(page);
+ }
+#endif
+
+ if (PagePrivate(page)) {
+ if (wb && !PageWriteback(page)) {
+ set_page_private(page, 0);
+ afs_put_writeback(wb);
+ }
+
+ if (!page_private(page))
+ ClearPagePrivate(page);
+ }
+ }
+
+ _leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
*/
static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
+ struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- struct afs_writeback *wb;
_enter("{{%x:%u}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
gfp_flags);
+ /* deny if page is being written to the cache and the caller hasn't
+ * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+ if (PageFsCache(page)) {
+ if (fscache_check_page_write(vnode->cache, page)) {
+ if (!(gfp_flags & __GFP_WAIT)) {
+ _leave(" = F [cache busy]");
+ return 0;
+ }
+ fscache_wait_on_page_write(vnode->cache, page);
+ }
+
+ fscache_uncache_page(vnode->cache, page);
+ ClearPageFsCache(page);
+ }
+#endif
+
if (PagePrivate(page)) {
- wb = (struct afs_writeback *) page_private(page);
- ASSERT(wb != NULL);
- set_page_private(page, 0);
+ if (wb) {
+ set_page_private(page, 0);
+ afs_put_writeback(wb);
+ }
ClearPagePrivate(page);
- afs_put_writeback(wb);
}
- _leave(" = 0");
- return 0;
+ /* indicate that the page can be released */
+ _leave(" = T");
+ return 1;
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index bb47217f6a1..c048f065875 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
return -EBADMSG;
}
+#ifdef CONFIG_AFS_FSCACHE
+ if (vnode->status.size != inode->i_size)
+ fscache_attr_changed(vnode->cache);
+#endif
+
inode->i_nlink = vnode->status.nlink;
inode->i_uid = vnode->status.owner;
inode->i_gid = 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
return inode;
}
-#ifdef AFS_CACHING_SUPPORT
- /* set up caching before reading the status, as fetch-status reads the
- * first page of symlinks to see if they're really mntpts */
- cachefs_acquire_cookie(vnode->volume->cache,
- NULL,
- vnode,
- &vnode->cache);
-#endif
-
if (!status) {
/* it's a remotely extant inode */
set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
}
}
+ /* set up caching before mapping the status, as map-status reads the
+ * first page of symlinks to see if they're really mountpoints */
+ inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+ vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+ &afs_vnode_cache_index_def,
+ vnode);
+#endif
+
ret = afs_inode_map_status(vnode, key);
if (ret < 0)
goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
/* failure */
bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_relinquish_cookie(vnode->cache, 0);
+ vnode->cache = NULL;
+#endif
iget_failed(inode);
_leave(" = %d [bad]", ret);
return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
ASSERT(list_empty(&vnode->writebacks));
ASSERT(!vnode->cb_promised);
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_relinquish_cookie(vnode->cache, 0);
vnode->cache = NULL;
#endif
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 67f259d99cd..106be66dafd 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
#include "afs.h"
#include "afs_vl.h"
+#include "cache.h"
#define AFS_CELL_MAX_ADDRS 15
@@ -193,8 +194,8 @@ struct afs_cell {
struct key *anonymous_key; /* anonymous user key for this cell */
struct list_head proc_link; /* /proc cell list link */
struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+ struct fscache_cookie *cache; /* caching cookie */
#endif
/* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
struct list_head grave; /* link in master graveyard list */
struct list_head update; /* link in master update list */
struct afs_cell *cell; /* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+ struct fscache_cookie *cache; /* caching cookie */
#endif
struct afs_cache_vlocation vldb; /* volume information DB record */
struct afs_volume *vols[3]; /* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
atomic_t usage;
struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */
struct afs_vlocation *vlocation; /* volume location */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+ struct fscache_cookie *cache; /* caching cookie */
#endif
afs_volid_t vid; /* volume ID */
afs_voltype_t type; /* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
struct afs_server *server; /* server currently supplying this file */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
- struct cachefs_cookie *cache; /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+ struct fscache_cookie *cache; /* caching cookie */
#endif
struct afs_permits *permits; /* cache of permits so far obtained */
struct mutex permits_lock; /* lock for altering permits list */
@@ -428,6 +429,22 @@ struct afs_uuid {
/*****************************************************************************/
/*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL)
+#endif
+
+/*
* callback.c
*/
extern void afs_init_callback_state(struct afs_server *);
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
*/
extern struct rw_semaphore afs_proc_cells_sem;
extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
* main.c
*/
extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
/*
* misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
/*
* vlclient.c
*/
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
const char *, struct afs_cache_vlocation *,
const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
/*
* vnode.c
*/
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
{
return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
/*
* volume.c
*/
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
extern void afs_put_volume(struct afs_volume *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 2d3e5d4fb9f..66d54d348c5 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,6 +1,6 @@
/* AFS client file system
*
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
module_param(rootcell, charp, 0);
MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
- .get_page_cookie = afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
- .name = "afs",
- .version = 0,
- .ops = &afs_cache_ops,
-};
-#endif
-
struct afs_uuid afs_uuid;
/*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
if (ret < 0)
return ret;
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
/* we want to be able to cache */
- ret = cachefs_register_netfs(&afs_cache_netfs,
- &afs_cache_cell_index_def);
+ ret = fscache_register_netfs(&afs_cache_netfs);
if (ret < 0)
goto error_cache;
#endif
@@ -142,8 +129,8 @@ error_fs:
error_open_socket:
error_vl_update_init:
error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
- cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_unregister_netfs(&afs_cache_netfs);
error_cache:
#endif
afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
afs_vlocation_purge();
flush_scheduled_work();
afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
- cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_unregister_netfs(&afs_cache_netfs);
#endif
afs_proc_cleanup();
rcu_barrier();
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 78db4953a80..2b9e2d03a39 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
if (PageError(page))
goto error;
- buf = kmap(page);
+ buf = kmap_atomic(page, KM_USER0);
memcpy(devname, buf, size);
- kunmap(page);
+ kunmap_atomic(buf, KM_USER0);
page_cache_release(page);
page = NULL;
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 849fc3160cb..ec2a7431e45 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
vl->vldb = *vldb;
-#ifdef AFS_CACHING_SUPPORT
- /* update volume entry in local cache */
- cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_update_cookie(vl->cache);
#endif
}
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
memset(&vldb, 0, sizeof(vldb));
/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
- cachefs_acquire_cookie(cell->cache,
- &afs_volume_cache_index_def,
- vlocation,
- &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+ vl->cache = fscache_acquire_cookie(vl->cell->cache,
+ &afs_vlocation_cache_index_def, vl);
#endif
if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
spin_unlock(&vl->lock);
wake_up(&vl->waitq);
+ /* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_update_cookie(vl->cache);
+#endif
+
/* schedule for regular updates */
afs_vlocation_queue_for_updates(vl);
goto success;
@@ -465,7 +467,7 @@ found_in_memory:
spin_unlock(&vl->lock);
success:
- _leave(" = %p",vl);
+ _leave(" = %p", vl);
return vl;
error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
{
_enter("%p", vl);
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_relinquish_cookie(vl->cache, 0);
#endif
-
afs_put_cell(vl->cell);
kfree(vl);
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 8bab0e3437f..a353e69e239 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
}
/* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
- cachefs_acquire_cookie(vlocation->cache,
- &afs_vnode_cache_index_def,
- volume,
- &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+ volume->cache = fscache_acquire_cookie(vlocation->cache,
+ &afs_volume_cache_index_def,
+ volume);
#endif
-
afs_get_vlocation(vlocation);
volume->vlocation = vlocation;
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
up_write(&vlocation->cell->vl_sem);
/* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
- cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_relinquish_cookie(volume->cache, 0);
#endif
afs_put_vlocation(vlocation);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3fb36d43362..c2e7a7ff008 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
_leave(" = %d", ret);
return ret;
}
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+ _enter("{{%x:%u}},{%lx}",
+ vnode->fid.vid, vnode->fid.vnode, page->index);
+
+ /* wait for the page to be written to the cache before we allow it to
+ * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+ fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+ _leave(" = 0");
+ return 0;
+}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d0..b7ff33c6310 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffe..9e5ae8a4f5c 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- struct dentry *dentry;
struct vfsmount *mnt;
- int err = -EAGAIN;
int how;
how = param->expire.how;
mnt = fp->f_path.mnt;
- if (autofs_type_trigger(sbi->type))
- dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
- else
- dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-
- if (dentry) {
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
-
- /*
- * This is synchronous because it makes the daemon a
- * little easier
- */
- err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-
- spin_lock(&sbi->fs_lock);
- if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
- ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
- sbi->sb->s_root->d_mounted++;
- }
- ino->flags &= ~AUTOFS_INF_EXPIRING;
- complete_all(&ino->expire_complete);
- spin_unlock(&sbi->fs_lock);
- dput(dentry);
- }
-
- return err;
+ return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
}
/* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9..75f7ddacf7d 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int __user *arg)
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int when)
{
struct dentry *dentry;
int ret = -EAGAIN;
- int do_now = 0;
-
- if (arg && get_user(do_now, arg))
- return -EFAULT;
if (autofs_type_trigger(sbi->type))
- dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+ dentry = autofs4_expire_direct(sb, mnt, sbi, when);
else
- dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
if (dentry) {
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
return ret;
}
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int __user *arg)
+{
+ int do_now = 0;
+
+ if (arg && get_user(do_now, arg))
+ return -EFAULT;
+
+ return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
+
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a950..e383bf0334f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
- expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
- if (expiring) {
- /*
- * If we are racing with expire the request might not
- * be quite complete but the directory has been removed
- * so it must have been successful, so just wait for it.
- */
- ino = autofs4_dentry_ino(expiring);
- autofs4_expire_wait(expiring);
- spin_lock(&sbi->lookup_lock);
- if (!list_empty(&ino->expiring))
- list_del_init(&ino->expiring);
- spin_unlock(&sbi->lookup_lock);
- dput(expiring);
- }
-
unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
if (unhashed)
dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
}
if (!oz_mode) {
+ mutex_unlock(&dir->i_mutex);
+ expiring = autofs4_lookup_expiring(sbi,
+ dentry->d_parent,
+ &dentry->d_name);
+ if (expiring) {
+ /*
+ * If we are racing with expire the request might not
+ * be quite complete but the directory has been removed
+ * so it must have been successful, so just wait for it.
+ */
+ ino = autofs4_dentry_ino(expiring);
+ autofs4_expire_wait(expiring);
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&ino->expiring))
+ list_del_init(&ino->expiring);
+ spin_unlock(&sbi->lookup_lock);
+ dput(expiring);
+ }
+
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_AUTOFS_PENDING;
spin_unlock(&dentry->d_lock);
- if (dentry->d_op && dentry->d_op->d_revalidate) {
- mutex_unlock(&dir->i_mutex);
+ if (dentry->d_op && dentry->d_op->d_revalidate)
(dentry->d_op->d_revalidate)(dentry, nd);
- mutex_lock(&dir->i_mutex);
- }
+ mutex_lock(&dir->i_mutex);
}
/*
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d06cb023ad0..76afd0d6b86 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -900,6 +900,7 @@ static int
befs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
befs_debug(sb, "---> befs_statfs()");
@@ -910,6 +911,8 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = 0; /* UNKNOWN */
buf->f_ffree = 0; /* UNKNOWN */
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = BEFS_NAME_LEN;
befs_debug(sb, "<--- befs_statfs()");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235f853..40381df3486 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -12,8 +12,6 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/time.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/errno.h>
@@ -21,20 +19,15 @@
#include <linux/binfmts.h>
#include <linux/string.h>
#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
#include <linux/slab.h>
-#include <linux/shm.h>
#include <linux/personality.h>
#include <linux/elfcore.h>
#include <linux/init.h>
#include <linux/highuid.h>
-#include <linux/smp.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/security.h>
-#include <linux/syscalls.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/utsname.h>
@@ -576,7 +569,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata;
unsigned long elf_bss, elf_brk;
- int elf_exec_fileno;
int retval, i;
unsigned int size;
unsigned long elf_entry;
@@ -631,12 +623,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out_free_ph;
}
- retval = get_unused_fd();
- if (retval < 0)
- goto out_free_ph;
- get_file(bprm->file);
- fd_install(elf_exec_fileno = retval, bprm->file);
-
elf_ppnt = elf_phdata;
elf_bss = 0;
elf_brk = 0;
@@ -655,13 +641,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
retval = -ENOEXEC;
if (elf_ppnt->p_filesz > PATH_MAX ||
elf_ppnt->p_filesz < 2)
- goto out_free_file;
+ goto out_free_ph;
retval = -ENOMEM;
elf_interpreter = kmalloc(elf_ppnt->p_filesz,
GFP_KERNEL);
if (!elf_interpreter)
- goto out_free_file;
+ goto out_free_ph;
retval = kernel_read(bprm->file, elf_ppnt->p_offset,
elf_interpreter,
@@ -956,8 +942,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
kfree(elf_phdata);
- sys_close(elf_exec_fileno);
-
set_binfmt(&elf_format);
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
@@ -1028,8 +1012,6 @@ out_free_dentry:
fput(interpreter);
out_free_interp:
kfree(elf_interpreter);
-out_free_file:
- sys_close(elf_exec_fileno);
out_free_ph:
kfree(elf_phdata);
goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3e72c5c19f..70cfc4b84ae 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -972,9 +972,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
params->elfhdr_addr = seg->addr;
/* clear any space allocated but not loaded */
- if (phdr->p_filesz < phdr->p_memsz)
- clear_user((void *) (seg->addr + phdr->p_filesz),
- phdr->p_memsz - phdr->p_filesz);
+ if (phdr->p_filesz < phdr->p_memsz) {
+ ret = clear_user((void *) (seg->addr + phdr->p_filesz),
+ phdr->p_memsz - phdr->p_filesz);
+ if (ret)
+ return ret;
+ }
if (mm) {
if (phdr->p_flags & PF_X) {
@@ -1014,7 +1017,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
unsigned long load_addr, delta_vaddr;
- int loop, dvset;
+ int loop, dvset, ret;
load_addr = params->load_addr;
delta_vaddr = 0;
@@ -1114,7 +1117,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
* PT_LOAD */
if (prot & PROT_WRITE && disp > 0) {
kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
- clear_user((void __user *) maddr, disp);
+ ret = clear_user((void __user *) maddr, disp);
+ if (ret)
+ return ret;
maddr += disp;
}
@@ -1149,15 +1154,19 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
if (prot & PROT_WRITE && excess1 > 0) {
kdebug("clear[%d] ad=%lx sz=%lx",
loop, maddr + phdr->p_filesz, excess1);
- clear_user((void __user *) maddr + phdr->p_filesz,
- excess1);
+ ret = clear_user((void __user *) maddr + phdr->p_filesz,
+ excess1);
+ if (ret)
+ return ret;
}
#else
if (excess > 0) {
kdebug("clear[%d] ad=%lx sz=%lx",
loop, maddr + phdr->p_filesz, excess);
- clear_user((void *) maddr + phdr->p_filesz, excess);
+ ret = clear_user((void *) maddr + phdr->p_filesz, excess);
+ if (ret)
+ return ret;
}
#endif
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 08644a61616..eff74b9c9e7 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -188,7 +188,6 @@ out:
static int
load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
{
- int som_exec_fileno;
int retval;
unsigned int size;
unsigned long som_entry;
@@ -220,12 +219,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
goto out_free;
}
- retval = get_unused_fd();
- if (retval < 0)
- goto out_free;
- get_file(bprm->file);
- fd_install(som_exec_fileno = retval, bprm->file);
-
/* Flush all traces of the currently running executable */
retval = flush_old_exec(bprm);
if (retval)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c3c6899ccf..f45dbc18dd1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev)
}
return sync_blockdev(bdev);
}
+EXPORT_SYMBOL(fsync_bdev);
/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b..9adf5e4f7e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
- compression.o
+ compression.o delayed-ref.o
else
# Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba..7fdd184a528 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
+ /*
+ * list for tracking inodes that must be sent to disk before a
+ * rename or truncate commit
+ */
+ struct list_head ordered_operations;
+
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
@@ -86,12 +92,6 @@ struct btrfs_inode {
*/
u64 logged_trans;
- /*
- * trans that last made a change that should be fully fsync'd. This
- * gets reset to zero each time the inode is logged
- */
- u64 log_dirty_trans;
-
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
@@ -121,6 +121,25 @@ struct btrfs_inode {
/* the start of block group preferred for allocations. */
u64 block_group;
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ *
+ * yes, its silly to have a single bitflag, but we might grow more
+ * of these.
+ */
+ unsigned ordered_data_close:1;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529a..dbb72412463 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
* empty_size -- a hint that you plan on doing more cow. This is the size in
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
*/
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- u64 prealloc_dest)
+ u64 search_start, u64 empty_size)
{
u64 parent_start;
struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
level = btrfs_header_level(buf);
nritems = btrfs_header_nritems(buf);
- if (prealloc_dest) {
- struct btrfs_key ins;
-
- ins.objectid = prealloc_dest;
- ins.offset = buf->len;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
- root->root_key.objectid,
- trans->transid, level, &ins);
- BUG_ON(ret);
- cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
- buf->len, level);
- } else {
- cow = btrfs_alloc_free_block(trans, root, buf->len,
- parent_start,
- root->root_key.objectid,
- trans->transid, level,
- search_start, empty_size);
- }
+ cow = btrfs_alloc_free_block(trans, root, buf->len,
+ parent_start, root->root_key.objectid,
+ trans->transid, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest)
+ struct extent_buffer **cow_ret)
{
u64 search_start;
int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_owner(buf) == root->root_key.objectid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
*cow_ret = buf;
- WARN_ON(prealloc_dest);
return 0;
}
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0,
- prealloc_dest);
+ parent_slot, cow_ret, search_start, 0);
return ret;
}
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize), 0);
+ (end_slot - i) * blocksize));
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(!child);
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
BUG_ON(ret);
spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, child->start,
+ child->len,
mid->start, child->start,
root->root_key.objectid,
trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
+ if (trans->transaction->delayed_refs.flushing &&
+ btrfs_header_nritems(mid) > 2)
+ return 0;
+
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left, 0);
+ parent, pslot - 1, &left);
if (wret) {
ret = wret;
goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right, 0);
+ parent, pslot + 1, &right);
if (wret) {
ret = wret;
goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left, 0);
+ pslot - 1, &left);
if (ret)
wret = 1;
else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right, 0);
+ &right);
if (ret)
wret = 1;
else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
u8 lowest_level = 0;
u64 blocknr;
u64 gen;
- struct btrfs_key prealloc_block;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (ins_len < 0)
lowest_unlock = 2;
- prealloc_block.objectid = 0;
-
again:
if (p->skip_locking)
b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
!btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
goto cow_done;
}
-
- /* ok, we have to cow, is our old prealloc the right
- * size?
- */
- if (prealloc_block.objectid &&
- prealloc_block.offset != b->len) {
- btrfs_release_path(root, p);
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- prealloc_block.objectid = 0;
- goto again;
- }
-
- /*
- * for higher level blocks, try not to allocate blocks
- * with the block and the parent locks held.
- */
- if (level > 0 && !prealloc_block.objectid) {
- u32 size = b->len;
- u64 hint = b->start;
-
- btrfs_release_path(root, p);
- ret = btrfs_reserve_extent(trans, root,
- size, size, 0,
- hint, (u64)-1,
- &prealloc_block, 0);
- BUG_ON(ret);
- goto again;
- }
-
btrfs_set_path_blocking(p);
wret = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1],
- &b, prealloc_block.objectid);
- prealloc_block.objectid = 0;
+ p->slots[level + 1], &b);
if (wret) {
free_extent_buffer(b);
ret = wret;
@@ -1742,12 +1688,8 @@ done:
* we don't really know what they plan on doing with the path
* from here on, so for now just mark it as blocking
*/
- btrfs_set_path_blocking(p);
- if (prealloc_block.objectid) {
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- }
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
return ret;
}
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
int ret;
eb = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
BUG_ON(ret);
btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
}
ret = btrfs_cow_block(trans, root, eb, parent, slot,
- &eb, 0);
+ &eb);
BUG_ON(ret);
if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, lower->start,
- lower->start, c->start,
+ lower->len, lower->start, c->start,
root->root_key.objectid,
trans->transid, level - 1);
BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
- } else {
+ } else if (!trans->transaction->delayed_refs.flushing) {
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes. returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems)
{
struct extent_buffer *left = path->nodes[0];
- struct extent_buffer *right;
- struct extent_buffer *upper;
+ struct extent_buffer *upper = path->nodes[1];
struct btrfs_disk_key disk_key;
int slot;
u32 i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
- u32 left_nritems;
u32 nr;
u32 right_nritems;
u32 data_end;
u32 this_item_size;
int ret;
- slot = path->slots[1];
- if (!path->nodes[1])
- return 1;
-
- upper = path->nodes[1];
- if (slot >= btrfs_header_nritems(upper) - 1)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- right = read_node_slot(root, upper, slot + 1);
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right, 0);
- if (ret)
- goto out_unlock;
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- left_nritems = btrfs_header_nritems(left);
- if (left_nritems == 0)
- goto out_unlock;
-
if (empty)
nr = 0;
else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (path->slots[0] >= left_nritems)
push_space += data_size;
+ slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
}
/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ return __push_leaf_right(trans, root, path, data_size, empty,
+ right, free_space, left_nritems);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*/
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, int right_nritems)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
- struct extent_buffer *left;
int slot;
int i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
u32 old_left_nritems;
- u32 right_nritems;
u32 nr;
int ret = 0;
int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
u32 old_left_item_size;
slot = path->slots[1];
- if (slot == 0)
- return 1;
- if (!path->nodes[1])
- return 1;
-
- right_nritems = btrfs_header_nritems(right);
- if (right_nritems == 0)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- left = read_node_slot(root, path->nodes[1], slot - 1);
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left, 0);
- if (ret) {
- /* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
- goto out;
- }
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
if (empty)
nr = right_nritems;
@@ -2755,6 +2682,154 @@ out:
}
/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ return __push_leaf_left(trans, root, path, data_size,
+ empty, left, free_space, right_nritems);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ int ret = 0;
+ int wret;
+ struct btrfs_disk_key disk_key;
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(right, i);
+ u32 ioff;
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(right, item);
+ btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ ret = 0;
+ btrfs_item_key(right, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ BUG_ON(ret);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+
+ return ret;
+}
+
+/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int mid;
int slot;
struct extent_buffer *right;
- int data_copy_size;
- int rt_data_off;
- int i;
int ret = 0;
int wret;
int double_split;
int num_doubles = 0;
- struct btrfs_disk_key disk_key;
/* first try to make some room by pushing left and right */
- if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+ !trans->transaction->delayed_refs.flushing) {
wret = push_leaf_right(trans, root, path, data_size, 0);
if (wret < 0)
return wret;
@@ -2830,11 +2902,14 @@ again:
write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
(unsigned long)btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
+
if (mid <= slot) {
if (nritems == 1 ||
leaf_space_used(l, mid, nritems - mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (slot >= nritems) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
if (leaf_space_used(l, 0, mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (!extend && data_size && slot == 0) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
}
}
}
- nritems = nritems - mid;
- btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
- copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(mid),
- nritems * sizeof(struct btrfs_item));
-
- copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
- data_copy_size, btrfs_leaf_data(l) +
- leaf_data_end(root, l), data_copy_size);
-
- rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
- btrfs_item_end_nr(l, mid);
-
- for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(right, i);
- u32 ioff;
-
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
- }
-
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
-
- btrfs_set_header_nritems(l, mid);
- ret = 0;
- btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
- BUG_ON(path->slots[0] != slot);
- ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
BUG_ON(ret);
- if (mid <= slot) {
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
- path->nodes[0] = right;
- path->slots[0] -= mid;
- path->slots[1] += 1;
- } else {
- btrfs_tree_unlock(right);
- free_extent_buffer(right);
- }
-
- BUG_ON(path->slots[0] < 0);
-
if (double_split) {
BUG_ON(num_doubles != 0);
num_doubles++;
goto again;
}
+
return ret;
}
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
return -EAGAIN;
}
+ btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &orig_key, path,
sizeof(struct btrfs_item), 1);
path->keep_locks = 0;
BUG_ON(ret);
+ btrfs_unlock_up_safe(path, 1);
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
/*
* make sure any changes to the path from split_leaf leave it
* in a blocking state
*/
btrfs_set_path_blocking(path);
- leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
item = btrfs_item_nr(leaf, path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
-
buf = kmalloc(item_size, GFP_NOFS);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
*/
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
- struct extent_buffer *leaf;
struct btrfs_item *item;
- int ret = 0;
- int slot;
- int slot_orig;
int i;
u32 nritems;
- u32 total_size = 0;
- u32 total_data = 0;
unsigned int data_end;
struct btrfs_disk_key disk_key;
+ int ret;
+ struct extent_buffer *leaf;
+ int slot;
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
-
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- slot_orig = path->slots[0];
leaf = path->nodes[0];
+ slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
BUG();
}
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
if (slot != nritems) {
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
data_end -= data_size[i];
btrfs_set_item_size(leaf, item, data_size[i]);
}
+
btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
ret = 0;
if (slot == 0) {
+ struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
ret = fixup_low_keys(trans, root, path, &disk_key, 1);
}
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 total_size = 0;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ total_data, total_size, nr);
+
out:
- btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+ !trans->transaction->delayed_refs.flushing) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
slot = path->slots[1];
extent_buffer_get(leaf);
+ btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d..9417713542a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
#define BTRFS_MAX_LEVEL 8
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list. That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -401,15 +408,16 @@ struct btrfs_path {
int locks[BTRFS_MAX_LEVEL];
int reada;
/* keep some upper locks as we walk down */
- int keep_locks;
- int skip_locking;
int lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
* and to force calls to keep space in the nodes
*/
- int search_for_split;
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int leave_spinning:1;
};
/*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
struct extent_io_tree pinned_extents;
- struct extent_io_tree pending_del;
- struct extent_io_tree extent_ins;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
u64 generation;
u64 last_trans_committed;
- u64 last_trans_new_blockgroup;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
unsigned long mount_opt;
u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
- struct mutex extent_ins_mutex;
struct mutex pinned_mutex;
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
+
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
@@ -737,10 +757,29 @@ struct btrfs_fs_info {
* ordered extents
*/
spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
struct list_head ordered_extents;
+
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
struct list_head delalloc_inodes;
/*
+ * special rename and truncate targets that must be on disk before
+ * we're allowed to commit. This is basically the ext3 style
+ * data=ordered list.
+ */
+ struct list_head ordered_operations;
+
+ /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
atomic_t throttle_gen;
u64 total_pinned;
+
+ /* protected by the delalloc lock, used to keep from writing
+ * metadata until there is a nice batch
+ */
+ u64 dirty_metadata_bytes;
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
}
/* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs);
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 orig_parent, u64 parent,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest);
+ struct extent_buffer **cow_ret);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2060,7 +2101,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
unsigned long btrfs_force_ra(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
pgoff_t offset, pgoff_t last_index);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_delete_inode(struct inode *inode);
void btrfs_put_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 00000000000..cbf7dc8ae3e
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include <linux/ftrace.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 parent)
+{
+ if (bytenr < ref->bytenr)
+ return -1;
+ if (bytenr > ref->bytenr)
+ return 1;
+ if (parent < ref->parent)
+ return -1;
+ if (parent > ref->parent)
+ return 1;
+ return 0;
+}
+
+/*
+ * insert a new ref into the rbtree. This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ rb_node);
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent). This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct btrfs_delayed_ref_node **last)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+ if (last)
+ *last = entry;
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 start)
+{
+ int count = 0;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (start == 0) {
+ node = rb_first(&delayed_refs->root);
+ } else {
+ ref = NULL;
+ tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+ if (ref) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ tmp = rb_entry(node,
+ struct btrfs_delayed_ref_node,
+ rb_node);
+ if (tmp->bytenr < start)
+ break;
+ ref = tmp;
+ node = rb_prev(&ref->rb_node);
+ }
+ node = &ref->rb_node;
+ } else
+ node = rb_first(&delayed_refs->root);
+ }
+again:
+ while (node && count < 32) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (list_empty(&head->cluster)) {
+ list_add_tail(&head->cluster, cluster);
+ delayed_refs->run_delayed_start =
+ head->node.bytenr;
+ count++;
+
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ } else if (count) {
+ /* the goal of the clustering is to find extents
+ * that are likely to end up in the same extent
+ * leaf on disk. So, we don't want them spread
+ * all over the tree. Stop now if we've hit
+ * a head that was already in use
+ */
+ break;
+ }
+ }
+ node = rb_next(node);
+ }
+ if (count) {
+ return 0;
+ } else if (start) {
+ /*
+ * we've gone to the end of the rbtree without finding any
+ * clusters. start from the beginning and try again
+ */
+ start = 0;
+ node = rb_first(&delayed_refs->root);
+ goto again;
+ }
+ return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr. It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree. There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *prev_node;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ prev_node = rb_prev(&ref->rb_node);
+ if (!prev_node)
+ goto out;
+ ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr == bytenr)
+ ret = 1;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 num_refs;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ delayed_refs = &trans->transaction->delayed_refs;
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ } else {
+ num_refs = 0;
+ ret = 0;
+ }
+
+ spin_lock(&delayed_refs->lock);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (mutex_trylock(&head->mutex)) {
+ num_refs += ref->ref_mod;
+ mutex_unlock(&head->mutex);
+ *refs = num_refs;
+ goto out;
+ }
+
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
+ } else {
+ *refs = num_refs;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree. existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref *existing_ref;
+ struct btrfs_delayed_ref *ref;
+
+ existing_ref = btrfs_delayed_node_to_ref(existing);
+ ref = btrfs_delayed_node_to_ref(update);
+
+ if (ref->pin)
+ existing_ref->pin = 1;
+
+ if (ref->action != existing_ref->action) {
+ /*
+ * this is effectively undoing either an add or a
+ * drop. We decrement the ref_mod, and if it goes
+ * down to zero we just delete the entry without
+ * every changing the extent allocation tree.
+ */
+ existing->ref_mod--;
+ if (existing->ref_mod == 0) {
+ rb_erase(&existing->rb_node,
+ &delayed_refs->root);
+ existing->in_tree = 0;
+ btrfs_put_delayed_ref(existing);
+ delayed_refs->num_entries--;
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+ }
+ } else {
+ if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+ /* if we're adding refs, make sure all the
+ * details match up. The extent could
+ * have been totally freed and reallocated
+ * by a different owner before the delayed
+ * ref entries were removed.
+ */
+ existing_ref->owner_objectid = ref->owner_objectid;
+ existing_ref->generation = ref->generation;
+ existing_ref->root = ref->root;
+ existing->num_bytes = update->num_bytes;
+ }
+ /*
+ * the action on the existing ref matches
+ * the action on the ref we're trying to add.
+ * Bump the ref_mod by one so the backref that
+ * is eventually added/removed has the correct
+ * reference count
+ */
+ existing->ref_mod += update->ref_mod;
+ }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref_head *existing_ref;
+ struct btrfs_delayed_ref_head *ref;
+
+ existing_ref = btrfs_delayed_node_to_head(existing);
+ ref = btrfs_delayed_node_to_head(update);
+
+ if (ref->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ /*
+ * update the reference mod on the head to reflect this new operation
+ */
+ existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref *full_ref;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /*
+ * the head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (parent == (u64)-1) {
+ if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+ else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ }
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+ * the reserved accounting when the extent is finally added, or
+ * if a later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record
+ * that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT) {
+ must_insert_reserved = 1;
+ action = BTRFS_ADD_DELAYED_REF;
+ } else {
+ must_insert_reserved = 0;
+ }
+
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->parent = parent;
+ ref->ref_mod = count_mod;
+ ref->in_tree = 1;
+ ref->num_bytes = num_bytes;
+
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head_ref = btrfs_delayed_node_to_head(ref);
+ head_ref->must_insert_reserved = must_insert_reserved;
+ INIT_LIST_HEAD(&head_ref->cluster);
+ mutex_init(&head_ref->mutex);
+ } else {
+ full_ref = btrfs_delayed_node_to_ref(ref);
+ full_ref->root = ref_root;
+ full_ref->generation = ref_generation;
+ full_ref->owner_objectid = owner_objectid;
+ full_ref->pin = pin;
+ full_ref->action = action;
+ }
+
+ existing = tree_insert(&delayed_refs->root, bytenr,
+ parent, &ref->rb_node);
+
+ if (existing) {
+ if (btrfs_delayed_ref_is_head(ref))
+ update_existing_head_ref(existing, ref);
+ else
+ update_existing_ref(trans, delayed_refs, existing, ref);
+
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ if (btrfs_delayed_ref_is_head(ref)) {
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ }
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0, action, pin);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, action, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref)
+ return btrfs_delayed_node_to_head(ref);
+ return NULL;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref *old_ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+ if (!old_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+ if (orig_parent == 0)
+ orig_parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ kfree(old_ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+ orig_parent, orig_ref_root,
+ orig_ref_generation, owner_objectid,
+ BTRFS_DROP_DELAYED_REF, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 00000000000..3bec2ff0b15
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node rb_node;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the parent our backref will point to */
+ u64 parent;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* ref count on this data structure */
+ atomic_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ /* is this node still in the rbtree? */
+ unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ struct btrfs_delayed_ref_node node;
+
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ struct list_head cluster;
+
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+ struct btrfs_delayed_ref_node node;
+
+ /* the root objectid our ref will point to */
+ u64 root;
+
+ /* the generation for the backref */
+ u64 generation;
+
+ /* owner_objectid of the backref */
+ u64 owner_objectid;
+
+ /* operation done by this entry in the rbtree */
+ u8 action;
+
+ /* if pin == 1, when the extent is freed it will be pinned until
+ * transaction commit
+ */
+ unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+ struct rb_root root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ unsigned long num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ /*
+ * set when the tree is flushing before a transaction commit,
+ * used by the throttling code to decide if new updates need
+ * to be run right away
+ */
+ int flushing;
+
+ u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(atomic_read(&ref->refs) == 0);
+ if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ kfree(ref);
+ }
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+ return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(!btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7..1d70236ba00 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
+
path = btrfs_alloc_path();
+ path->leave_spinning = 1;
+
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc86..92d73929d38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct extent_buffer *eb;
+ int was_dirty;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (!(current->flags & PF_MEMALLOC)) {
+ return extent_write_full_page(tree, page,
+ btree_get_extent, wbc);
+ }
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
+ redirty_page_for_writepage(wbc, page);
+ eb = btrfs_find_tree_block(root, page_offset(page),
+ PAGE_CACHE_SIZE);
+ WARN_ON(!eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- return extent_write_full_page(tree, page, btree_get_extent, wbc);
+ free_extent_buffer(eb);
+
+ unlock_page(page);
+ return 0;
}
static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_root *root = BTRFS_I(mapping->host)->root;
u64 num_dirty;
- u64 start = 0;
unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
- num_dirty = count_range_bits(tree, &start, (u64)-1,
- thresh, EXTENT_DIRTY);
+ /* this is a bit racy, but that's ok */
+ num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty < thresh)
return 0;
}
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
- /* ugh, clear_extent_buffer_dirty can be expensive */
- btrfs_set_lock_blocking(buf);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= buf->len)
+ root->fs_info->dirty_metadata_bytes -= buf->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
buf);
}
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
- printk(KERN_INFO "btrfs: total reference cache "
- "size %llu\n",
- root->fs_info->total_ref_cache_size);
- }
-
mutex_lock(&root->fs_info->trans_mutex);
cur = root->fs_info->running_transaction;
if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
mutex_unlock(&root->fs_info->trans_mutex);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
+
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->ordered_operations);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
extent_io_tree_init(&fs_info->pinned_extents,
fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->pending_del,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->extent_ins,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->drop_mutex);
- mutex_init(&fs_info->extent_ins_mutex);
mutex_init(&fs_info->pinned_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
struct inode *btree_inode = root->fs_info->btree_inode;
-
- btrfs_set_lock_blocking(buf);
+ int was_dirty;
btrfs_assert_tree_locked(buf);
if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+ was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += buf->len;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
}
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
int btree_lock_page_hook(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_lock(eb);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= eb->len)
+ root->fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227b..c958ecbc191 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad205..f5e7cae63d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,17 +49,23 @@ struct pending_extent_op {
int del;
};
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int ref_to_drop);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
return ret;
}
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *update_list)
-{
- struct btrfs_key key;
- struct btrfs_extent_ref *ref;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- int ret = 0;
- struct list_head *cur = update_list->next;
- u64 ref_objectid;
- u64 ref_root = extent_root->root_key.objectid;
-
- op = list_entry(cur, struct pending_extent_op, list);
-
-search:
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_REF_KEY;
- key.offset = op->orig_parent;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
- BUG_ON(ret);
-
- leaf = path->nodes[0];
-
-loop:
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
- ref_objectid = btrfs_ref_objectid(leaf, ref);
-
- if (btrfs_ref_root(leaf, ref) != ref_root ||
- btrfs_ref_generation(leaf, ref) != op->orig_generation ||
- (ref_objectid != op->level &&
- ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
- printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
- "root %llu, owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)op->orig_parent,
- (unsigned long long)ref_root, op->level);
- btrfs_print_leaf(extent_root, leaf);
- BUG();
- }
-
- key.objectid = op->bytenr;
- key.offset = op->parent;
- key.type = BTRFS_EXTENT_REF_KEY;
- ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
- BUG_ON(ret);
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
- btrfs_set_ref_generation(leaf, ref, op->generation);
-
- cur = cur->next;
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
-
- if (cur == update_list) {
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto out;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
-
- path->slots[0]++;
- while (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid == op->bytenr &&
- key.type == BTRFS_EXTENT_REF_KEY)
- goto loop;
- path->slots[0]++;
- }
-
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto search;
-
-out:
- return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *insert_list, int nr)
-{
- struct btrfs_key *keys;
- u32 *data_size;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- struct list_head *cur = insert_list->next;
- struct btrfs_fs_info *info = extent_root->fs_info;
- u64 ref_root = extent_root->root_key.objectid;
- int i = 0, last = 0, ret;
- int total = nr * 2;
-
- if (!nr)
- return 0;
-
- keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys)
- return -ENOMEM;
-
- data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
- if (!data_size) {
- kfree(keys);
- return -ENOMEM;
- }
-
- list_for_each_entry(op, insert_list, list) {
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->num_bytes;
- keys[i].type = BTRFS_EXTENT_ITEM_KEY;
- data_size[i] = sizeof(struct btrfs_extent_item);
- i++;
-
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->parent;
- keys[i].type = BTRFS_EXTENT_REF_KEY;
- data_size[i] = sizeof(struct btrfs_extent_ref);
- i++;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
- i = 0;
- while (i < total) {
- int c;
- ret = btrfs_insert_some_items(trans, extent_root, path,
- keys+i, data_size+i, total-i);
- BUG_ON(ret < 0);
-
- if (last && ret > 1)
- BUG();
-
- leaf = path->nodes[0];
- for (c = 0; c < ret; c++) {
- int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
- /*
- * if the first item we inserted was a backref, then
- * the EXTENT_ITEM will be the odd c's, else it will
- * be the even c's
- */
- if ((ref_first && (c % 2)) ||
- (!ref_first && !(c % 2))) {
- struct btrfs_extent_item *itm;
-
- itm = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], itm, 1);
- op->del++;
- } else {
- struct btrfs_extent_ref *ref;
-
- ref = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_ref);
- btrfs_set_ref_root(leaf, ref, ref_root);
- btrfs_set_ref_generation(leaf, ref,
- op->generation);
- btrfs_set_ref_objectid(leaf, ref, op->level);
- btrfs_set_ref_num_refs(leaf, ref, 1);
- op->del++;
- }
-
- /*
- * using del to see when its ok to free up the
- * pending_extent_op. In the case where we insert the
- * last item on the list in order to help do batching
- * we need to not free the extent op until we actually
- * insert the extent_item
- */
- if (op->del == 2) {
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1,
- GFP_NOFS);
- cur = cur->next;
- list_del_init(&op->list);
- kfree(op);
- if (cur != insert_list)
- op = list_entry(cur,
- struct pending_extent_op,
- list);
- }
- }
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(extent_root, path);
-
- /*
- * Ok backref's and items usually go right next to eachother,
- * but if we could only insert 1 item that means that we
- * inserted on the end of a leaf, and we have no idea what may
- * be on the next leaf so we just play it safe. In order to
- * try and help this case we insert the last thing on our
- * insert list so hopefully it will end up being the last
- * thing on the leaf and everything else will be before it,
- * which will let us insert a whole bunch of items at the same
- * time.
- */
- if (ret == 1 && !last && (i + ret < total)) {
- /*
- * last: where we will pick up the next time around
- * i: our current key to insert, will be total - 1
- * cur: the current op we are screwing with
- * op: duh
- */
- last = i + ret;
- i = total - 1;
- cur = insert_list->prev;
- op = list_entry(cur, struct pending_extent_op, list);
- } else if (last) {
- /*
- * ok we successfully inserted the last item on the
- * list, lets reset everything
- *
- * i: our current key to insert, so where we left off
- * last time
- * last: done with this
- * cur: the op we are messing with
- * op: duh
- * total: since we inserted the last key, we need to
- * decrement total so we dont overflow
- */
- i = last;
- last = 0;
- total--;
- if (i < total) {
- cur = insert_list->next;
- op = list_entry(cur, struct pending_extent_op,
- list);
- }
- } else {
- i += ret;
- }
-
- cond_resched();
- }
- ret = 0;
- kfree(keys);
- kfree(data_size);
- return ret;
-}
-
static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 ref_root, u64 ref_generation,
- u64 owner_objectid)
+ u64 owner_objectid,
+ int refs_to_add)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_ref_root(leaf, ref, ref_root);
btrfs_set_ref_generation(leaf, ref, ref_generation);
btrfs_set_ref_objectid(leaf, ref, owner_objectid);
- btrfs_set_ref_num_refs(leaf, ref, 1);
+ btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
} else if (ret == -EEXIST) {
u64 existing_owner;
+
BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
num_refs = btrfs_ref_num_refs(leaf, ref);
BUG_ON(num_refs == 0);
- btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+ btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
existing_owner = btrfs_ref_objectid(leaf, ref);
if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
} else {
goto out;
}
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ int refs_to_drop)
{
struct extent_buffer *leaf;
struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
num_refs = btrfs_ref_num_refs(leaf, ref);
- BUG_ON(num_refs == 0);
- num_refs -= 1;
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
} else {
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
#endif
}
-static noinline int free_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct list_head *del_list)
-{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct btrfs_key key, found_key;
- struct extent_buffer *leaf;
- struct list_head *cur;
- struct pending_extent_op *op;
- struct btrfs_extent_item *ei;
- int ret, num_to_del, extent_slot = 0, found_extent = 0;
- u32 refs;
- u64 bytes_freed = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 1;
-
-search:
- /* search for the backref for the current ref we want to delete */
- cur = del_list->next;
- op = list_entry(cur, struct pending_extent_op, list);
- ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
- op->orig_parent,
- extent_root->root_key.objectid,
- op->orig_generation, op->level, 1);
- if (ret) {
- printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
- "root %llu gen %llu owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)extent_root->root_key.objectid,
- (unsigned long long)op->orig_generation, op->level);
- btrfs_print_leaf(extent_root, path->nodes[0]);
- WARN_ON(1);
- goto out;
- }
-
- extent_slot = path->slots[0];
- num_to_del = 1;
- found_extent = 0;
-
- /*
- * if we aren't the first item on the leaf we can move back one and see
- * if our ref is right next to our extent item
- */
- if (likely(extent_slot)) {
- extent_slot--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- extent_slot);
- if (found_key.objectid == op->bytenr &&
- found_key.type == BTRFS_EXTENT_ITEM_KEY &&
- found_key.offset == op->num_bytes) {
- num_to_del++;
- found_extent = 1;
- }
- }
-
- /*
- * if we didn't find the extent we need to delete the backref and then
- * search for the extent item key so we can update its ref count
- */
- if (!found_extent) {
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = op->num_bytes;
-
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
- btrfs_release_path(extent_root, path);
- ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
- BUG_ON(ret);
- extent_slot = path->slots[0];
- }
-
- /* this is where we update the ref count for the extent */
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
- refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs--;
- btrfs_set_extent_refs(leaf, ei, refs);
-
- btrfs_mark_buffer_dirty(leaf);
-
- /*
- * This extent needs deleting. The reason cur_slot is extent_slot +
- * num_to_del is because extent_slot points to the slot where the extent
- * is, and if the backref was not right next to the extent we will be
- * deleting at least 1 item, and will want to start searching at the
- * slot directly next to extent_slot. However if we did find the
- * backref next to the extent item them we will be deleting at least 2
- * items and will want to start searching directly after the ref slot
- */
- if (!refs) {
- struct list_head *pos, *n, *end;
- int cur_slot = extent_slot+num_to_del;
- u64 super_used;
- u64 root_used;
-
- path->slots[0] = extent_slot;
- bytes_freed = op->num_bytes;
-
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, op->bytenr,
- op->num_bytes, op->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
- op->del = ret;
-
- /*
- * we need to see if we can delete multiple things at once, so
- * start looping through the list of extents we are wanting to
- * delete and see if their extent/backref's are right next to
- * eachother and the extents only have 1 ref
- */
- for (pos = cur->next; pos != del_list; pos = pos->next) {
- struct pending_extent_op *tmp;
-
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /* we only want to delete extent+ref at this stage */
- if (cur_slot >= btrfs_header_nritems(leaf) - 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_ITEM_KEY ||
- found_key.offset != tmp->num_bytes)
- break;
-
- /* check to make sure this extent only has one ref */
- ei = btrfs_item_ptr(leaf, cur_slot,
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, ei) != 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_REF_KEY ||
- found_key.offset != tmp->orig_parent)
- break;
-
- /*
- * the ref is right next to the extent, we can set the
- * ref count to 0 since we will delete them both now
- */
- btrfs_set_extent_refs(leaf, ei, 0);
-
- /* pin down the bytes for this extent */
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
- tmp->num_bytes, tmp->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
-
- /*
- * use the del field to tell if we need to go ahead and
- * free up the extent when we delete the item or not.
- */
- tmp->del = ret;
- bytes_freed += tmp->num_bytes;
-
- num_to_del += 2;
- cur_slot += 2;
- }
- end = pos;
-
- /* update the free space counters */
- spin_lock(&info->delalloc_lock);
- super_used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- super_used - bytes_freed);
-
- root_used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- root_used - bytes_freed);
- spin_unlock(&info->delalloc_lock);
-
- /* delete the items */
- ret = btrfs_del_items(trans, extent_root, path,
- path->slots[0], num_to_del);
- BUG_ON(ret);
-
- /*
- * loop through the extents we deleted and do the cleanup work
- * on them
- */
- for (pos = cur, n = pos->next; pos != end;
- pos = n, n = pos->next) {
- struct pending_extent_op *tmp;
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /*
- * remember tmp->del tells us wether or not we pinned
- * down the extent
- */
- ret = update_block_group(trans, extent_root,
- tmp->bytenr, tmp->num_bytes, 0,
- tmp->del);
- BUG_ON(ret);
-
- list_del_init(&tmp->list);
- unlock_extent(&info->extent_ins, tmp->bytenr,
- tmp->bytenr + tmp->num_bytes - 1,
- GFP_NOFS);
- kfree(tmp);
- }
- } else if (refs && found_extent) {
- /*
- * the ref and extent were right next to eachother, but the
- * extent still has a ref, so just free the backref and keep
- * going
- */
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- } else {
- /*
- * the extent has multiple refs and the backref we were looking
- * for was not right next to it, so just unlock and go next,
- * we're good to go
- */
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- }
-
- btrfs_release_path(extent_root, path);
- if (!list_empty(del_list))
- goto search;
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
int ret;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- struct btrfs_path *path;
-
- if (root == root->fs_info->extent_root) {
- struct pending_extent_op *extent_op;
- u64 num_bytes;
-
- BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
- num_bytes = btrfs_level_size(root, (int)owner_objectid);
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
- BUG_ON(extent_op->parent != orig_parent);
- BUG_ON(extent_op->generation != orig_generation);
+ int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
- extent_op->parent = parent;
- extent_op->generation = ref_generation;
- } else {
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_BACKREF_UPDATE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = orig_parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = orig_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->extent_ins,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- bytenr, (unsigned long)extent_op);
- }
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = lookup_extent_backref(trans, extent_root, path,
- bytenr, orig_parent, orig_root,
- orig_generation, owner_objectid, 1);
- if (ret)
- goto out;
- ret = remove_extent_backref(trans, extent_root, path);
- if (ret)
- goto out;
- ret = insert_extent_backref(trans, extent_root, path, bytenr,
- parent, ref_root, ref_generation,
- owner_objectid);
+ ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+ orig_parent, parent, orig_root,
+ ref_root, orig_generation,
+ ref_generation, owner_objectid, pin);
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- del_pending_extents(trans, extent_root, 0);
-out:
- btrfs_free_path(path);
return ret;
}
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 orig_parent, u64 parent,
+ u64 num_bytes, u64 orig_parent, u64 parent,
u64 ref_root, u64 ref_generation,
u64 owner_objectid)
{
@@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
- parent, ref_root, ref_root,
- ref_generation, ref_generation,
- owner_objectid);
+
+ ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+ orig_parent, parent, ref_root,
+ ref_root, ref_generation,
+ ref_generation, owner_objectid);
return ret;
}
-
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
+ int ret;
+
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+ return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid,
+ int refs_to_add)
+{
struct btrfs_path *path;
int ret;
struct btrfs_key key;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)-1;
+ key.offset = num_bytes;
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 1);
- if (ret < 0)
+ /* first find the extent item and update its reference count */
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+ path, 0, 1);
+ if (ret < 0) {
+ btrfs_set_path_blocking(path);
return ret;
- BUG_ON(ret == 0 || path->slots[0] == 0);
+ }
- path->slots[0]--;
+ if (ret > 0) {
+ WARN_ON(1);
+ btrfs_free_path(path);
+ return -EIO;
+ }
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
refs = btrfs_extent_refs(l, item);
- btrfs_set_extent_refs(l, item, refs + 1);
+ btrfs_set_extent_refs(l, item, refs + refs_to_add);
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(root->fs_info->extent_root, path);
path->reada = 1;
+ path->leave_spinning = 1;
+
+ /* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent,
ref_root, ref_generation,
- owner_objectid);
+ owner_objectid, refs_to_add);
BUG_ON(ret);
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- del_pending_extents(trans, root->fs_info->extent_root, 0);
-
btrfs_free_path(path);
return 0;
}
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+ ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
0, ref_root, 0, ref_generation,
owner_objectid);
return ret;
}
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node)
+{
+ int ret = 0;
+ struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+ BUG_ON(node->ref_mod == 0);
+ ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, ref->pin, node->ref_mod);
+
+ return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ int insert_reserved)
{
- u64 start;
- u64 end;
int ret;
+ struct btrfs_delayed_ref *ref;
+
+ if (node->parent == (u64)-1) {
+ struct btrfs_delayed_ref_head *head;
+ /*
+ * we've hit the end of the chain and we were supposed
+ * to insert this extent into the tree. But, it got
+ * deleted before we ever needed to insert it, so all
+ * we have to do is clean up the accounting
+ */
+ if (insert_reserved) {
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ }
+ head = btrfs_delayed_node_to_head(node);
+ mutex_unlock(&head->mutex);
+ return 0;
+ }
- while(1) {
- finish_current_insert(trans, root->fs_info->extent_root, 1);
- del_pending_extents(trans, root->fs_info->extent_root, 1);
+ ref = btrfs_delayed_node_to_ref(node);
+ if (ref->action == BTRFS_ADD_DELAYED_REF) {
+ if (insert_reserved) {
+ struct btrfs_key ins;
- /* is there more work to do? */
- ret = find_first_extent_bit(&root->fs_info->pending_del,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- ret = find_first_extent_bit(&root->fs_info->extent_ins,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- break;
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ /* record the full extent allocation */
+ ret = __btrfs_alloc_reserved_extent(trans, root,
+ node->parent, ref->root,
+ ref->generation, ref->owner_objectid,
+ &ins, node->ref_mod);
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ } else {
+ /* just add one backref */
+ ret = add_extent_ref(trans, root, node->bytenr,
+ node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, node->ref_mod);
+ }
+ BUG_ON(ret);
+ } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+ WARN_ON(insert_reserved);
+ ret = drop_delayed_ref(trans, root, node);
}
return 0;
}
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct btrfs_path *path;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ int action = BTRFS_ADD_DELAYED_REF;
+again:
+ /*
+ * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * this prevents ref count from going down to zero when
+ * there still are pending delayed ref.
+ */
+ node = rb_prev(&head->node.rb_node);
+ while (1) {
+ if (!node)
+ break;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+ if (btrfs_delayed_node_to_ref(ref)->action == action)
+ return ref;
+ node = rb_prev(node);
+ }
+ if (action == BTRFS_ADD_DELAYED_REF) {
+ action = BTRFS_DROP_DELAYED_REF;
+ goto again;
+ }
+ return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct list_head *cluster)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
- struct btrfs_key key;
- struct extent_buffer *l;
- struct btrfs_extent_item *item;
+ int count = 0;
+ int must_insert_reserved = 0;
- WARN_ON(num_bytes < root->sectorsize);
- path = btrfs_alloc_path();
- path->reada = 1;
- key.objectid = bytenr;
- key.offset = num_bytes;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- if (ret != 0) {
- btrfs_print_leaf(root, path->nodes[0]);
- printk(KERN_INFO "btrfs failed to find block number %llu\n",
- (unsigned long long)bytenr);
- BUG();
+ delayed_refs = &trans->transaction->delayed_refs;
+ while (1) {
+ if (!locked_ref) {
+ /* pick a new head ref from the cluster list */
+ if (list_empty(cluster))
+ break;
+
+ locked_ref = list_entry(cluster->next,
+ struct btrfs_delayed_ref_head, cluster);
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * record the must insert reserved flag before we
+ * drop the spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+ if (!ref) {
+ /* All delayed refs have been processed, Go ahead
+ * and send the head node to run_one_delayed_ref,
+ * so that any accounting fixes can happen
+ */
+ ref = &locked_ref->node;
+ list_del_init(&locked_ref->cluster);
+ locked_ref = NULL;
+ }
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root, ref,
+ must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(ref);
+
+ count++;
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct list_head cluster;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+ int run_most = 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ INIT_LIST_HEAD(&cluster);
+again:
+ spin_lock(&delayed_refs->lock);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+ while (1) {
+ if (!(run_all || run_most) &&
+ delayed_refs->num_heads_ready < 64)
+ break;
+
+ /*
+ * go find something we can process in the rbtree. We start at
+ * the beginning of the tree, and then build a cluster
+ * of refs to process starting at the first one we are able to
+ * lock
+ */
+ ret = btrfs_find_ref_cluster(trans, &cluster,
+ delayed_refs->run_delayed_start);
+ if (ret)
+ break;
+
+ ret = run_clustered_refs(trans, root, &cluster);
+ BUG_ON(ret < 0);
+
+ count -= min_t(unsigned long, ret, count);
+
+ if (count == 0)
+ break;
+ }
+
+ if (run_all) {
+ node = rb_first(&delayed_refs->root);
+ if (!node)
+ goto out;
+ count = (unsigned long)-1;
+
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ atomic_inc(&ref->refs);
+
+ spin_unlock(&delayed_refs->lock);
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+ goto again;
+ }
+ node = rb_next(node);
+ }
+ spin_unlock(&delayed_refs->lock);
+ schedule_timeout(1);
+ goto again;
}
- l = path->nodes[0];
- item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
- *refs = btrfs_extent_refs(l, item);
out:
- btrfs_free_path(path);
+ spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
int refi = 0;
int slot;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, u64, u64, u64);
ref_root = btrfs_header_owner(buf);
ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, slot);
+ fi = btrfs_item_ptr(buf, slot,
+ struct btrfs_file_extent_item);
+
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
ret = process_func(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ key.objectid);
if (ret) {
faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
goto fail;
}
} else {
- ret = process_func(trans, root, bytenr,
+ ret = process_func(trans, root, bytenr, buf->len,
orig_buf->start, buf->start,
orig_root, ref_root,
orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
if (bytenr == 0)
continue;
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root, orig_generation,
+ ref_generation, key.objectid);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, slot);
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
+ buf->len, orig_buf->start,
+ buf->start, orig_root, ref_root,
orig_generation, ref_generation,
level - 1);
if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache)
{
int ret;
- int pending_ret;
struct btrfs_root *extent_root = root->fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(extent_root, path);
fail:
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
return ret;
- if (pending_ret)
- return pending_ret;
return 0;
}
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
clear_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
}
+ mutex_unlock(&root->fs_info->pinned_mutex);
+
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
- mutex_lock(&root->fs_info->pinned_mutex);
while (1) {
+ mutex_lock(&root->fs_info->pinned_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- if (need_resched()) {
- mutex_unlock(&root->fs_info->pinned_mutex);
- cond_resched();
- mutex_lock(&root->fs_info->pinned_mutex);
- }
+ cond_resched();
}
mutex_unlock(&root->fs_info->pinned_mutex);
return ret;
}
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
-{
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct pending_extent_op *extent_op, *tmp;
- struct list_head insert_list, update_list;
- int ret;
- int num_inserts = 0, max_inserts, restart = 0;
-
- path = btrfs_alloc_path();
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
-
- max_inserts = extent_root->leafsize /
- (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
- sizeof(struct btrfs_extent_ref) +
- sizeof(struct btrfs_extent_item));
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(&info->extent_ins, search, &start,
- &end, EXTENT_WRITEBACK);
- if (ret) {
- if (restart && !num_inserts &&
- list_empty(&update_list)) {
- restart = 0;
- search = 0;
- continue;
- }
- break;
- }
-
- ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- if (all)
- restart = 1;
- search = end + 1;
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- continue;
- }
-
- ret = get_state_private(&info->extent_ins, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- num_inserts++;
- list_add_tail(&extent_op->list, &insert_list);
- search = end + 1;
- if (num_inserts == max_inserts) {
- restart = 1;
- break;
- }
- } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &update_list);
- search = end + 1;
- } else {
- BUG();
- }
- }
-
- /*
- * process the update list, clear the writeback bit for it, and if
- * somebody marked this thing for deletion then just unlock it and be
- * done, the free_extents will handle it
- */
- list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
- kfree(extent_op);
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- /*
- * still have things left on the update list, go ahead an update
- * everything
- */
- if (!list_empty(&update_list)) {
- ret = update_backrefs(trans, extent_root, path, &update_list);
- BUG_ON(ret);
-
- /* we may have COW'ed new blocks, so lets start over */
- if (all)
- restart = 1;
- }
-
- /*
- * if no inserts need to be done, but we skipped some extents and we
- * need to make sure everything is cleaned then reset everything and
- * go back to the beginning
- */
- if (!num_inserts && restart) {
- search = 0;
- restart = 0;
- INIT_LIST_HEAD(&update_list);
- INIT_LIST_HEAD(&insert_list);
- goto again;
- } else if (!num_inserts) {
- goto out;
- }
-
- /*
- * process the insert extents list. Again if we are deleting this
- * extent, then just unlock it, pin down the bytes if need be, and be
- * done with it. Saves us from having to actually insert the extent
- * into the tree and then subsequently come along and delete it
- */
- mutex_lock(&info->extent_ins_mutex);
- list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- u64 used;
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- spin_lock(&info->delalloc_lock);
- used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- used - extent_op->num_bytes);
- used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- used - extent_op->num_bytes);
- spin_unlock(&info->delalloc_lock);
-
- ret = update_block_group(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes,
- 0, ret > 0);
- BUG_ON(ret);
- kfree(extent_op);
- num_inserts--;
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- ret = insert_extents(trans, extent_root, path, &insert_list,
- num_inserts);
- BUG_ON(ret);
-
- /*
- * if restart is set for whatever reason we need to go back and start
- * searching through the pending list again.
- *
- * We just inserted some extents, which could have resulted in new
- * blocks being allocated, which would result in new blocks needing
- * updates, so if all is set we _must_ restart to get the updated
- * blocks.
- */
- if (restart || all) {
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
- search = 0;
- restart = 0;
- num_inserts = 0;
- goto again;
- }
-out:
- btrfs_free_path(path);
- return 0;
-}
-
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data)
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, int is_data,
+ struct extent_buffer **must_clean)
{
int err = 0;
struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
u64 header_transid = btrfs_header_generation(buf);
if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+ header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
header_transid == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- clean_tree_block(NULL, root, buf);
- btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ *must_clean = buf;
return 1;
}
btrfs_tree_unlock(buf);
}
free_extent_buffer(buf);
pinit:
+ btrfs_set_path_blocking(path);
+ mutex_lock(&root->fs_info->pinned_mutex);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
BUG_ON(err < 0);
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin, int mark_free)
+ u64 owner_objectid, int pin, int mark_free,
+ int refs_to_drop)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
ret = lookup_extent_backref(trans, extent_root, path,
bytenr, parent, root_objectid,
ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
break;
}
if (!found_extent) {
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
- "root %llu gen %llu owner %llu\n",
+ "parent %llu root %llu gen %llu owner %llu\n",
(unsigned long long)bytenr,
+ (unsigned long long)parent,
(unsigned long long)root_objectid,
(unsigned long long)ref_generation,
(unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs -= 1;
- btrfs_set_extent_refs(leaf, ei, refs);
+ /*
+ * we're not allowed to delete the extent item if there
+ * are other delayed ref updates pending
+ */
+
+ BUG_ON(refs < refs_to_drop);
+ refs -= refs_to_drop;
+ btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
- if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+ if (refs == 0 && found_extent &&
+ path->slots[0] == extent_slot + 1) {
struct btrfs_extent_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_ref);
- BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+ BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
/* if the back ref and the extent are next to each other
* they get deleted below in one shot
*/
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
} else if (found_extent) {
/* otherwise delete the extent back ref */
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
/* if refs are 0, we need to setup the path for deletion */
if (refs == 0) {
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root, &key, path,
-1, 1);
BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
if (refs == 0) {
u64 super_used;
u64 root_used;
+ struct extent_buffer *must_clean = NULL;
if (pin) {
- mutex_lock(&root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, root, bytenr, num_bytes,
- owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, root, path,
+ bytenr, num_bytes,
+ owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+ &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
}
+
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item,
root_used - num_bytes);
spin_unlock(&info->delalloc_lock);
+
+ /*
+ * it is going to be very rare for someone to be waiting
+ * on the block we're freeing. del_items might need to
+ * schedule, so rather than get fancy, just force it
+ * to blocking here
+ */
+ if (must_clean)
+ btrfs_set_lock_blocking(must_clean);
+
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
+
if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
+ } else {
+ invalidate_mapping_pages(info->btree_inode->i_mapping,
+ bytenr >> PAGE_CACHE_SHIFT,
+ (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
return ret;
}
/*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
*/
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int refs_to_drop)
{
- int ret;
- int err = 0;
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- int nr = 0, skipped = 0;
- struct extent_io_tree *pending_del;
- struct extent_io_tree *extent_ins;
- struct pending_extent_op *extent_op;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct list_head delete_list;
-
- INIT_LIST_HEAD(&delete_list);
- extent_ins = &extent_root->fs_info->extent_ins;
- pending_del = &extent_root->fs_info->pending_del;
-
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(pending_del, search, &start, &end,
- EXTENT_WRITEBACK);
- if (ret) {
- if (all && skipped && !nr) {
- search = 0;
- skipped = 0;
- continue;
- }
- mutex_unlock(&info->extent_ins_mutex);
- break;
- }
-
- ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- search = end+1;
- skipped = 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
-
- continue;
- }
- BUG_ON(ret < 0);
-
- ret = get_state_private(pending_del, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
- clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
- GFP_NOFS);
- if (!test_range_bit(extent_ins, start, end,
- EXTENT_WRITEBACK, 0)) {
- list_add_tail(&extent_op->list, &delete_list);
- nr++;
- } else {
- kfree(extent_op);
-
- ret = get_state_private(&info->extent_ins, start,
- &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
-
- clear_extent_bits(&info->extent_ins, start, end,
- EXTENT_WRITEBACK, GFP_NOFS);
-
- if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &delete_list);
- search = end + 1;
- nr++;
- continue;
- }
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, start,
- end + 1 - start, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- ret = update_block_group(trans, extent_root, start,
- end + 1 - start, 0, ret > 0);
-
- unlock_extent(extent_ins, start, end, GFP_NOFS);
- BUG_ON(ret);
- kfree(extent_op);
- }
- if (ret)
- err = ret;
-
- search = end + 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- }
+ WARN_ON(num_bytes < root->sectorsize);
- if (nr) {
- ret = free_extents(trans, extent_root, &delete_list);
- BUG_ON(ret);
- }
+ /*
+ * if metadata always pin
+ * if data pin when any transaction has committed this
+ */
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ ref_generation != trans->transid)
+ pin = 1;
- if (all && skipped) {
- INIT_LIST_HEAD(&delete_list);
- search = 0;
- nr = 0;
- goto again;
- }
+ if (ref_generation != trans->transid)
+ pin = 1;
- if (!err)
- finish_current_insert(trans, extent_root, 0);
- return err;
+ return __free_extent(trans, root, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid, pin, pin == 0, refs_to_drop);
}
/*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
*/
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr)
{
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- int pending_ret;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
int ret;
- WARN_ON(num_bytes < root->sectorsize);
- if (root == extent_root) {
- struct pending_extent_op *extent_op = NULL;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out;
- extent_op->del = 1;
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- }
+ node = rb_prev(&head->node.rb_node);
+ if (!node)
+ goto out;
- if (extent_op) {
- ref_generation = extent_op->orig_generation;
- parent = extent_op->orig_parent;
- }
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_DELETE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = ref_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->pending_del,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->pending_del,
- bytenr, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- /* if metadata always pin */
- if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
- mutex_lock(&root->fs_info->pinned_mutex);
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- mutex_unlock(&root->fs_info->pinned_mutex);
- update_reserved_extents(root, bytenr, num_bytes, 0);
- return 0;
- }
- pin = 1;
- }
+ /* there are still entries for this ref, we can't drop it */
+ if (ref->bytenr == bytenr)
+ goto out;
- /* if data pin when any transaction has committed this */
- if (ref_generation != trans->transid)
- pin = 1;
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
- ret = __free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin, pin == 0);
+ /*
+ * at this point we have a head with no other entries. Go
+ * ahead and process it.
+ */
+ head->node.in_tree = 0;
+ rb_erase(&head->node.rb_node, &delayed_refs->root);
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
- return ret ? ret : pending_ret;
+ delayed_refs->num_entries--;
+
+ /*
+ * we don't take a ref on the node because we're removing it from the
+ * tree, so we just steal the ref the tree was holding.
+ */
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+
+ list_del_init(&head->cluster);
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+ &head->node, head->must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
}
int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
{
int ret;
- ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin);
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ *
+ * data extents referenced by the tree log do need to have
+ * their reference counts bumped.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+ owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ mutex_lock(&root->fs_info->pinned_mutex);
+
+ /* unlocks the pinned mutex */
+ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ update_reserved_extents(root, bytenr, num_bytes, 0);
+ ret = 0;
+ } else {
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid,
+ BTRFS_DROP_DELAYED_REF, 1);
+ BUG_ON(ret);
+ ret = check_ref_cleanup(trans, root, bytenr);
+ BUG_ON(ret);
+ }
return ret;
}
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner, struct btrfs_key *ins)
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod)
{
int ret;
- int pending_ret;
u64 super_used;
u64 root_used;
u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, root_used + num_bytes);
spin_unlock(&info->delalloc_lock);
- if (root == extent_root) {
- struct pending_extent_op *extent_op;
-
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_INSERT;
- extent_op->bytenr = ins->objectid;
- extent_op->num_bytes = ins->offset;
- extent_op->parent = parent;
- extent_op->orig_parent = 0;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = 0;
- extent_op->level = (int)owner;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
- ins->objectid + ins->offset - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- ins->objectid, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- goto update_block;
- }
-
memcpy(&keys[0], ins, sizeof(*ins));
keys[1].objectid = ins->objectid;
keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
sizes, 2);
BUG_ON(ret);
extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+ btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_extent_ref);
btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
btrfs_set_ref_objectid(path->nodes[0], ref, owner);
- btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+ btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
btrfs_mark_buffer_dirty(path->nodes[0]);
trans->alloc_exclude_start = 0;
trans->alloc_exclude_nr = 0;
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
goto out;
- if (pending_ret) {
- ret = pending_ret;
- goto out;
- }
-update_block:
ret = update_block_group(trans, root, ins->objectid,
ins->offset, 1, 0);
if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
return 0;
- ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
- update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
+ BUG_ON(ret);
return ret;
}
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
put_block_group(block_group);
ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
+ ref_generation, owner, ins, 1);
return ret;
}
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
u64 search_end, struct btrfs_key *ins, u64 data)
{
int ret;
-
ret = __btrfs_reserve_extent(trans, root, num_bytes,
min_alloc_size, empty_size, hint_byte,
search_end, ins, data);
BUG_ON(ret);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = __btrfs_alloc_reserved_extent(trans, root, parent,
- root_objectid, ref_generation,
- owner_objectid, ins);
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
BUG_ON(ret);
-
- } else {
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
}
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- ret = __btrfs_free_extent(trans, root, disk_bytenr,
+ ret = btrfs_free_extent(trans, root, disk_bytenr,
btrfs_file_extent_disk_num_bytes(leaf, fi),
leaf->start, leaf_owner, leaf_generation,
key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < ref->nritems; i++) {
info = ref->extents + sorted[i].slot;
- ret = __btrfs_free_extent(trans, root, info->bytenr,
+ ret = btrfs_free_extent(trans, root, info->bytenr,
info->num_bytes, ref->bytenr,
ref->owner, ref->generation,
info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
return 0;
}
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 start,
u64 len, u32 *refs)
{
int ret;
- ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+ ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
BUG_ON(ret);
#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
* we just decrement it below and don't update any
* of the refs the leaf points to.
*/
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
if (refs != 1)
continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < refi; i++) {
bytenr = sorted[i].bytenr;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, eb->start,
root_owner, root_gen, 0, 1);
BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+ ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
path->nodes[*level]->len, &refs);
BUG_ON(ret);
if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
blocksize = btrfs_level_size(root, *level - 1);
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
/*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
root_gen = btrfs_header_generation(parent);
path->slots[*level]++;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, parent->start,
root_owner, root_gen,
*level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
* cleanup and free the reference on the last node
* we processed
*/
- ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner, root_gen,
*level, 1);
free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_path *path;
int i;
int orig_level;
+ int update_count;
struct btrfs_root_item *root_item = &root->root_item;
WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
}
}
while (1) {
+ unsigned long update;
wret = walk_down_tree(trans, root, path, &level);
if (wret > 0)
break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
break;
if (wret < 0)
ret = wret;
- if (trans->transaction->in_commit) {
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing) {
ret = -EAGAIN;
break;
}
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
+ for (update_count = 0; update_count < 16; update_count++) {
+ update = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (update)
+ btrfs_run_delayed_refs(trans, root, update);
+ else
+ break;
+ }
}
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, key.objectid);
BUG_ON(ret);
+
ret = btrfs_free_extent(trans, root,
bytenr, num_bytes, leaf->start,
btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
ref_path, NULL, NULL);
BUG_ON(ret);
- if (root == root->fs_info->extent_root)
- btrfs_extent_post_op(trans, root);
-
return 0;
}
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
goto out;
@@ -6208,6 +5640,9 @@ again:
btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ trans = btrfs_start_transaction(info->tree_root, 1);
+ btrfs_commit_transaction(trans, info->tree_root);
+
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_new_blockgroup = trans->transid;
+ root->fs_info->last_trans_log_full_commit = trans->transid;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
sizeof(cache->item));
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- ret = del_pending_extents(trans, extent_root, 0);
- BUG_ON(ret);
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e606..08085af089e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb)
{
- int set;
unsigned long i;
unsigned long num_pages;
struct page *page;
- u64 start = eb->start;
- u64 end = start + eb->len - 1;
-
- set = clear_extent_dirty(tree, start, end, GFP_NOFS);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!set && !PageDirty(page))
+ if (!PageDirty(page))
continue;
lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
else
set_page_private(page, EXTENT_PAGE_PRIVATE);
- /*
- * if we're on the last page or the first page and the
- * block isn't aligned on a page boundary, do extra checks
- * to make sure we don't clean page that is partially dirty
- */
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- start = (u64)page->index << PAGE_CACHE_SHIFT;
- end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end,
- EXTENT_DIRTY, 0)) {
- unlock_page(page);
- continue;
- }
- }
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
{
unsigned long i;
unsigned long num_pages;
+ int was_dirty = 0;
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- struct page *page = extent_buffer_page(eb, i);
- /* writepage may need to do something special for the
- * first page, we have to make sure page->private is
- * properly set. releasepage may drop page->private
- * on us if the page isn't already dirty.
- */
- lock_page(page);
- if (i == 0) {
- set_page_extent_head(page, eb->len);
- } else if (PagePrivate(page) &&
- page->private != EXTENT_PAGE_PRIVATE) {
- set_page_extent_mapped(page);
- }
+ for (i = 0; i < num_pages; i++)
__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
- set_extent_dirty(tree, page_offset(page),
- page_offset(page) + PAGE_CACHE_SIZE - 1,
- GFP_NOFS);
- unlock_page(page);
- }
- return 0;
+ return was_dirty;
}
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
ret = 0;
goto out;
}
+ if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ ret = 0;
+ goto out;
+ }
/* at this point we can safely release the extent buffer */
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf..5bc20abf3f3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
#define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
/*
* page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd..9b99886562d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
} else {
ins_size = csum_size;
}
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
+ path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
if (ret != 0) {
@@ -776,7 +780,6 @@ found:
item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
btrfs_item_size_nr(leaf, path->slots[0]));
eb_token = NULL;
- cond_resched();
next_sector:
if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
eb_token = NULL;
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- cond_resched();
if (total_bytes < sums->len) {
btrfs_release_path(root, path);
+ cond_resched();
goto again;
}
out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b..9c9fb46ccd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
btrfs_release_path(root, path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*extent));
BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
ram_bytes);
btrfs_set_file_extent_type(leaf, extent, found_type);
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_set_lock_blocking(path->nodes[0]);
if (disk_bytenr != 0) {
ret = btrfs_update_extent_ref(trans, root,
- disk_bytenr, orig_parent,
+ disk_bytenr,
+ le64_to_cpu(old.disk_num_bytes),
+ orig_parent,
leaf->start,
root->root_key.objectid,
trans->transid, ins.objectid);
BUG_ON(ret);
}
+ path->leave_spinning = 0;
btrfs_release_path(root, path);
if (disk_bytenr != 0)
inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
btrfs_set_file_extent_other_encoding(leaf, fi, 0);
if (orig_parent != leaf->start) {
- ret = btrfs_update_extent_ref(trans, root, bytenr,
+ ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
orig_parent, leaf->start,
root->root_key.objectid,
trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
page_cache_release(pinned[1]);
*ppos = pos;
+ /*
+ * we want to make sure fsync finds this change
+ * but we haven't joined a transaction running right now.
+ *
+ * Later on, someone is sure to update the inode and get the
+ * real transid recorded.
+ *
+ * We set last_trans now to the fs_info generation + 1,
+ * this will either be one more than the running transaction
+ * or the generation used for the next transaction if there isn't
+ * one running right now.
+ */
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
if (num_written > 0 && will_write) {
struct btrfs_trans_handle *trans;
@@ -1167,8 +1187,11 @@ out_nolock:
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
- btrfs_sync_log(trans, root);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_commit_transaction(trans, root);
}
@@ -1185,6 +1208,18 @@ out_nolock:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
+ /*
+ * ordered_data_close is set by settattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (BTRFS_I(inode)->ordered_data_close) {
+ BTRFS_I(inode)->ordered_data_close = 0;
+ btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(inode->i_mapping);
+ }
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
} else {
- btrfs_sync_log(trans, root);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a..6b627c61180 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22..06d8db5afb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
btrfs_set_trans_block_group(trans, inode);
key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap(cpage);
+ kaddr = kmap_atomic(cpage, KM_USER0);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap(cpage);
+ kunmap_atomic(kaddr, KM_USER0);
i++;
ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end,
size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- if (!btrfs_test_opt(root, COMPRESS)) {
- return cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
- }
-
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
EXTENT_DELALLOC, 1, 0, GFP_NOFS);
while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+ struct page *locked_page,
u64 start, u64 end, int *page_started, int force,
unsigned long *nr_written)
{
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_test_flag(inode, NODATACOW))
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
else if (btrfs_test_flag(inode, PREALLOC))
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
+ else if (!btrfs_test_opt(root, COMPRESS))
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
else
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
-
return ret;
}
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, file_pos, &hint);
BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, fi, compression);
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_set_lock_blocking(leaf);
+
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, inode->i_ino, &ins);
BUG_ON(ret);
-
btrfs_free_path(path);
+
return 0;
}
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction. It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_ordered_sum *sum;
+ u64 bytenr;
+
+ sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+ list);
+ bytenr = sum->sums[0].bytenr;
+
+ /*
+ * we don't care about the results, the point of this search is
+ * just to get the btree leaves into ram
+ */
+ btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct btrfs_ordered_extent *ordered_extent;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_path *path;
int compressed = 0;
int ret;
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
if (!ret)
return 0;
+ /*
+ * before we join the transaction, try to do some of our IO.
+ * This will limit the amount of IO that we have to do with
+ * the transaction running. We're unlikely to need to do any
+ * IO if the file extents are new, the disk_i_size checks
+ * covers the most common case.
+ */
+ if (start < BTRFS_I(inode)->disk_i_size) {
+ path = btrfs_alloc_path();
+ if (path) {
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ inode->i_ino,
+ start, 0);
+ ordered_extent = btrfs_lookup_ordered_extent(inode,
+ start);
+ if (!list_empty(&ordered_extent->list)) {
+ btrfs_release_path(root, path);
+ reada_csum(root, path, ordered_extent);
+ }
+ btrfs_free_path(path);
+ }
+ }
+
trans = btrfs_join_transaction(root, 1);
- ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered_extent)
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
BUG_ON(!ordered_extent);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_lookup_inode(trans, root, path,
&BTRFS_I(inode)->location, 1);
if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
+ path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
name, name_len, -1);
if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir->i_ino);
BUG_ON(ret != 0 && ret != -ENOENT);
- if (ret != -ENOENT)
- BTRFS_I(dir)->log_dirty_trans = trans->transid;
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
+
+ btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.type = (u8)-1;
search_again:
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto error;
@@ -2644,6 +2702,7 @@ delete:
break;
}
if (found_extent) {
+ btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes,
leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
- err = btrfs_cont_expand(inode, attr->ia_size);
- if (err)
- return err;
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ if (attr->ia_size > inode->i_size) {
+ err = btrfs_cont_expand(inode, attr->ia_size);
+ if (err)
+ return err;
+ } else if (inode->i_size > 0 &&
+ attr->ia_size == 0) {
+
+ /* we're truncating a file that used to have good
+ * data down to zero. Make sure it gets into
+ * the ordered flush list so that any new writes
+ * get down to disk quickly.
+ */
+ BTRFS_I(inode)->ordered_data_close = 1;
+ }
}
err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->disk_i_size = 0;
bi->flags = 0;
bi->index_cnt = (u64)-1;
- bi->log_dirty_trans = 0;
+ bi->last_unlink_trans = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_tree,
inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
inode->i_mapping, GFP_NOFS);
INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
mutex_init(&BTRFS_I(inode)->extent_mutex);
mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
sizes[1] = name_len + sizeof(*ref);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
if (ret != 0)
goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
nr = trans->blocks_used;
+
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
if (drop_inode) {
@@ -4292,8 +4365,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*/
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct inode *inode = fdentry(vma->vm_file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4380,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
u64 page_end;
ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret)
+ if (ret) {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else /* -ENOSPC, -EIO, etc */
+ ret = VM_FAULT_SIGBUS;
goto out;
+ }
- ret = -EINVAL;
+ ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
size = i_size_read(inode);
@@ -4357,6 +4436,8 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
@@ -4382,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
+
+ /*
+ * setattr is responsible for setting the ordered_data_close flag,
+ * but that is only tested during the last file release. That
+ * could happen well after the next commit, leaving a great big
+ * window where new writes may get lost if someone chooses to write
+ * to this file after truncating to zero
+ *
+ * The inode doesn't have any dirty data here, and so if we commit
+ * this is a noop. If someone immediately starts writing to the inode
+ * it is very likely we'll catch some of their writes in this
+ * transaction, and the commit will find this file on the ordered
+ * data list with good things to send down.
+ *
+ * This is a best effort solution, there is still a window where
+ * using truncate to replace the contents of the file will
+ * end up with a zero length file after a crash.
+ */
+ if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ btrfs_add_ordered_operation(trans, root, inode);
+
btrfs_set_trans_block_group(trans, inode);
btrfs_i_size_write(inode, inode->i_size);
@@ -4458,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_acl = BTRFS_ACL_NOT_CACHED;
ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->ordered_operations);
return &ei->vfs_inode;
}
void btrfs_destroy_inode(struct inode *inode)
{
struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
@@ -4474,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
posix_acl_release(BTRFS_I(inode)->i_default_acl);
- spin_lock(&BTRFS_I(inode)->root->list_lock);
+ /*
+ * Make sure we're properly removed from the ordered operation
+ * lists.
+ */
+ smp_mb();
+ if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_lock(&root->list_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
" list\n", inode->i_ino);
dump_stack();
}
- spin_unlock(&BTRFS_I(inode)->root->list_lock);
+ spin_unlock(&root->list_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4605,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
+ /*
+ * we're using rename to replace one file with another.
+ * and the replacement file is large. Start IO on it now so
+ * we don't add too much work to the end of the transaction
+ */
+ if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+ new_inode->i_size &&
+ old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(old_inode->i_mapping);
+
trans = btrfs_start_transaction(root, 1);
+ /*
+ * make sure the inode gets flushed if it is replacing
+ * something.
+ */
+ if (new_inode && new_inode->i_size &&
+ old_inode && S_ISREG(old_inode->i_mode)) {
+ btrfs_add_ordered_operation(trans, root, old_inode);
+ }
+
+ /*
+ * this is an ugly little race, but the rename is required to make
+ * sure that if we crash, the inode is either at the old name
+ * or the new one. pinning the log transaction lets us make sure
+ * we don't allow a log commit to come in after we unlink the
+ * name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+
btrfs_set_trans_block_group(trans, new_dir);
btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -4645,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
out_fail:
+
+ /* this btrfs_end_log_trans just allows the current
+ * log-sub transaction to complete
+ */
+ btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
out_unlock:
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c..7594bec1be1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
goto out_dput;
if (!IS_POSIXACL(parent->dentry->d_inode))
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
error = mnt_want_write(parent->mnt);
if (error)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a..a5310c0f41e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
static int btrfs_spin_on_block(struct extent_buffer *eb)
{
int i;
+
for (i = 0; i < 512; i++) {
- cpu_relax();
if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
return 1;
if (need_resched())
break;
+ cpu_relax();
}
return 0;
}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
{
int i;
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
-
+ if (btrfs_spin_on_block(eb)) {
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
/* spin for a bit on the BLOCKING flag */
for (i = 0; i < 2; i++) {
+ cpu_relax();
if (!btrfs_spin_on_block(eb))
break;
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
DEFINE_WAIT(wait);
wait.func = btrfs_wake_function;
+ if (!btrfs_spin_on_block(eb))
+ goto sleep;
+
while(1) {
spin_nested(eb);
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
* spin for a bit, and if the blocking flag goes away,
* loop around
*/
+ cpu_relax();
if (btrfs_spin_on_block(eb))
continue;
-
+sleep:
prepare_to_wait_exclusive(&eb->lock_wq, &wait,
TASK_UNINTERRUPTIBLE);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0..53c87b197d7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+
+ /*
+ * we have no more ordered extents for this inode and
+ * no dirty pages. We can safely remove it from the
+ * list of ordered extents
+ */
+ if (RB_EMPTY_ROOT(&tree->tree) &&
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ }
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
}
/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list. These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io. When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct inode *inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ inode = &btrfs_inode->vfs_inode;
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(inode);
+
+ if (!wait && inode) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ if (wait)
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ else
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+ }
+
+ cond_resched();
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ if (wait && !list_empty(&root->fs_info->ordered_operations))
+ goto again;
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
return ret;
}
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ u64 last_mod;
+
+ last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+ /*
+ * if this file hasn't been changed since the last transaction
+ * commit, we can safely return without doing anything
+ */
+ if (last_mod < root->fs_info->last_trans_committed)
+ return 0;
+
+ /*
+ * the transaction is already committing. Just start the IO and
+ * don't bother with all of this list nonsense
+ */
+ if (trans && root->fs_info->running_transaction->blocked) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ return 0;
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d..3d31c8827b0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4..664782c6a2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
cur_trans->use_count = 1;
cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root.rb_node = NULL;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->block_group = 0;
h->alloc_exclude_nr = 0;
h->alloc_exclude_start = 0;
+ h->delayed_ref_updates = 0;
+
root->fs_info->running_transaction->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
if (!root->fs_info->open_ioctl_trans)
wait_current_trans(root);
mutex_unlock(&root->fs_info->trans_mutex);
-
throttle_on_drops(root);
}
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *info = root->fs_info;
+ int count = 0;
+
+ while (count < 4) {
+ unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (cur &&
+ trans->transaction->delayed_refs.num_heads_ready > 64) {
+ trans->delayed_ref_updates = 0;
+
+ /*
+ * do a full flush if the transaction is trying
+ * to close
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ cur = 0;
+ btrfs_run_delayed_refs(trans, root, cur);
+ } else {
+ break;
+ }
+ count++;
+ }
mutex_lock(&info->trans_mutex);
cur_trans = info->running_transaction;
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
struct btrfs_root *tree_root = root->fs_info->tree_root;
- btrfs_extent_post_op(trans, root);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
btrfs_header_level(root->node));
btrfs_set_root_generation(&root->root_item, trans->transid);
- btrfs_extent_post_op(trans, root);
-
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
BUG_ON(ret);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *next;
struct extent_buffer *eb;
+ int ret;
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
eb = btrfs_lock_root_node(fs_info->tree_root);
- btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+ btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
}
/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&info->trans_mutex);
+ while (info->running_transaction &&
+ info->running_transaction->delayed_refs.flushing) {
+ prepare_to_wait(&info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&info->trans_mutex);
+ schedule();
+ mutex_lock(&info->trans_mutex);
+ finish_wait(&info->transaction_wait, &wait);
+ }
+ mutex_unlock(&info->trans_mutex);
+ return 0;
+}
+
+/*
* Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
* all of them
*/
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
atomic_inc(&root->fs_info->throttles);
while (1) {
+ /*
+ * we don't want to jump in and create a bunch of
+ * delayed refs if the transaction is starting to close
+ */
+ wait_transaction_pre_flush(tree_root->fs_info);
trans = btrfs_start_transaction(tree_root, 1);
+
+ /*
+ * we've joined a transaction, make sure it isn't
+ * closing right now
+ */
+ if (trans->transaction->delayed_refs.flushing) {
+ btrfs_end_transaction(trans, tree_root);
+ continue;
+ }
+
mutex_lock(&root->fs_info->drop_mutex);
ret = btrfs_drop_snapshot(trans, dirty->root);
if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
- btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+ btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_copy_root(trans, root, old, &tmp, objectid);
btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
+ int should_grow = 0;
+ unsigned long now = get_seconds();
+
+ btrfs_run_ordered_operations(root, 0);
+
+ /* make a pass through all the delayed refs we have so far
+ * any runnings procs may add more while we are here
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
+
+ cur_trans = trans->transaction;
+ /*
+ * set the flushing flag so procs in this transaction have to
+ * start sending their work down.
+ */
+ cur_trans->delayed_refs.flushing = 1;
+
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
- INIT_LIST_HEAD(&dirty_fs_roots);
mutex_lock(&root->fs_info->trans_mutex);
- if (trans->transaction->in_commit) {
- cur_trans = trans->transaction;
- trans->transaction->use_count++;
+ INIT_LIST_HEAD(&dirty_fs_roots);
+ if (cur_trans->in_commit) {
+ cur_trans->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
- cur_trans = trans->transaction;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
}
+ if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+ should_grow = 1;
+
do {
int snap_pending = 0;
joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
- else
+ else if (should_grow)
timeout = 1;
mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
- schedule_timeout(timeout);
+ /*
+ * rename don't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and no for sure that nothing new will be added
+ * to the list
+ */
+ btrfs_run_ordered_operations(root, 1);
+
+ smp_mb();
+ if (cur_trans->num_writers > 1 || should_grow)
+ schedule_timeout(timeout);
mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
} while (cur_trans->num_writers > 1 ||
- (cur_trans->num_joined != joined));
+ (should_grow && cur_trans->num_joined != joined));
ret = create_pending_snapshots(trans, root->fs_info);
BUG_ON(ret);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
WARN_ON(cur_trans != trans->transaction);
/* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_copy_pinned(root, pinned_copy);
trans->transaction->blocked = 0;
+
wake_up(&root->fs_info->transaction_throttle);
wake_up(&root->fs_info->transaction_wait);
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->trans_mutex);
cur_trans->commit_done = 1;
+
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f88..94f5bde2b58 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
#include "btrfs_inode.h"
+#include "delayed-ref.h"
struct btrfs_transaction {
u64 transid;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
unsigned long num_writers;
+
unsigned long num_joined;
int in_commit;
int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct btrfs_delayed_ref_root delayed_refs;
};
struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
u64 block_group;
u64 alloc_exclude_start;
u64 alloc_exclude_nr;
+ unsigned long delayed_ref_updates;
};
struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570..b10eacdb162 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
btrfs_release_path(root, path);
- if (is_extent)
- btrfs_extent_post_op(trans, root);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60f..fc9b87a7975 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
#define LOG_INODE_EXISTS 1
/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
* stages for the tree walking. The first
* stage (0) is to only pin down the blocks we find
* the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
/*
* tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
}
/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
mutex_lock(&log->fs_info->pinned_mutex);
btrfs_update_pinned_extents(log->fs_info->extent_root,
eb->start, eb->len, 1);
- mutex_unlock(&log->fs_info->pinned_mutex);
}
if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
ret = link_to_fixup_dir(trans, root, path, location.objectid);
BUG_ON(ret);
+
ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
BUG_ON(ret);
kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
victim_name_len)) {
btrfs_inc_nlink(inode);
btrfs_release_path(root, path);
+
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
key.offset--;
btrfs_release_path(root, path);
}
- btrfs_free_path(path);
+ btrfs_release_path(root, path);
if (nlink != inode->i_nlink) {
inode->i_nlink = nlink;
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ inode->i_ino, 1);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+
return 0;
}
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
iput(inode);
- if (key.offset == 0)
- break;
- key.offset--;
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
}
btrfs_release_path(root, path);
return 0;
@@ -1313,11 +1387,11 @@ again:
read_extent_buffer(eb, name, (unsigned long)(di + 1),
name_len);
log_di = NULL;
- if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
log_di = btrfs_lookup_dir_item(trans, log, log_path,
dir_key->objectid,
name, name_len, 0);
- } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
log_di = btrfs_lookup_dir_index_item(trans, log,
log_path,
dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid)
+ u64 dirid, int del_all)
{
u64 range_start;
u64 range_end;
@@ -1408,10 +1482,14 @@ again:
range_start = 0;
range_end = 0;
while (1) {
- ret = find_dir_range(log, path, dirid, key_type,
- &range_start, &range_end);
- if (ret != 0)
- break;
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+ }
dir_key.offset = range_start;
while (1) {
@@ -1437,7 +1515,8 @@ again:
break;
ret = check_item_in_log(trans, root, log, path,
- log_path, dir, &found_key);
+ log_path, dir,
+ &found_key);
BUG_ON(ret);
if (found_key.offset == (u64)-1)
break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid);
+ root, log, path, key.objectid, 0);
BUG_ON(ret);
}
ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root, inode, inode->i_size,
BTRFS_EXTENT_DATA_KEY);
BUG_ON(ret);
+
+ /* if the nlink count is zero here, the iput
+ * will free the inode. We bump it to make
+ * sure it doesn't get freed until the link
+ * count fixup is done
+ */
+ if (inode->i_nlink == 0) {
+ btrfs_inc_nlink(inode);
+ btrfs_update_inode(wc->trans,
+ root, inode);
+ }
iput(inode);
}
ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->log_transid < transid + 2 &&
+
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]))
schedule();
+
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
} while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
return 0;
}
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (atomic_read(&root->log_writers))
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(root, root->log_transid);
+ wait_log_commit(trans, root, root->log_transid);
mutex_unlock(&root->log_mutex);
return 0;
}
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(root, root->log_transid - 1);
+ wait_log_commit(trans, root, root->log_transid - 1);
while (1) {
unsigned long batch = root->log_batch;
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
- wait_for_writer(root);
+
+ wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
}
+ /* bail out if we need to do a full commit */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ ret = -EAGAIN;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
- wait_log_commit(log_root_tree, log_root_tree->log_transid);
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
atomic_set(&log_root_tree->log_commit[index2], 1);
- if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
- wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid - 1);
+ }
+
+ wait_for_writer(trans, log_root_tree);
- wait_for_writer(log_root_tree);
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+ }
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* in and cause problems either.
*/
write_ctree_super(trans, root->fs_info->tree_root, 2);
+ ret = 0;
+out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
return 0;
}
-/* * free all the extents used by the tree log. This should be called
+/*
+ * free all the extents used by the tree log. This should be called
* at commit time of the full transaction
*/
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return 0;
}
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return ret;
}
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
*
* This handles both files and directories.
*/
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only)
{
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
min_key.offset = 0;
max_key.objectid = inode->i_ino;
+
+ /* today the code can only do partial logging of directories */
+ if (!S_ISDIR(inode->i_mode))
+ inode_only = LOG_INODE_ALL;
+
if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /*
- * if this inode has already been logged and we're in inode_only
- * mode, we don't want to delete the things that have already
- * been written to the log.
- *
- * But, if the inode has been through an inode_only log,
- * the logged_trans field is not set. This allows us to catch
- * any new names for this inode in the backrefs by logging it
- * again
- */
- if (inode_only == LOG_INODE_EXISTS &&
- BTRFS_I(inode)->logged_trans == trans->transid) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- goto out;
- }
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
@@ -2693,7 +2809,6 @@ next_slot:
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- BTRFS_I(inode)->log_dirty_trans = 0;
ret = log_directory_changes(trans, root, inode, path, dst_path);
BUG_ON(ret);
}
@@ -2702,19 +2817,69 @@ next_slot:
btrfs_free_path(path);
btrfs_free_path(dst_path);
-out:
return 0;
}
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged. Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct dentry *parent,
+ struct super_block *sb,
+ u64 last_committed)
{
- int ret;
+ int ret = 0;
+ struct btrfs_root *root;
- start_log_trans(trans, root);
- ret = __btrfs_log_inode(trans, root, inode, inode_only);
- end_log_trans(root);
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto out;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ goto out;
+ inode = parent->d_inode;
+ }
+
+ while (1) {
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ smp_mb();
+
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ root = BTRFS_I(inode)->root;
+
+ /*
+ * make sure any commits to the log are forced
+ * to be full commits
+ */
+ root->fs_info->last_trans_log_full_commit =
+ trans->transid;
+ ret = 1;
+ break;
+ }
+
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+ if (parent == sb->s_root)
+ break;
+
+ parent = parent->d_parent;
+ inode = parent->d_inode;
+
+ }
+out:
return ret;
}
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only)
{
- int inode_only = LOG_INODE_ALL;
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
- int ret;
+ int ret = 0;
+ u64 last_committed = root->fs_info->last_trans_committed;
+
+ sb = inode->i_sb;
+
+ if (root->fs_info->last_trans_log_full_commit >
+ root->fs_info->last_trans_committed) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
start_log_trans(trans, root);
- sb = dentry->d_inode->i_sb;
- while (1) {
- ret = __btrfs_log_inode(trans, root, dentry->d_inode,
- inode_only);
- BUG_ON(ret);
- inode_only = LOG_INODE_EXISTS;
- dentry = dentry->d_parent;
- if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto no_parent;
+
+ inode_only = LOG_INODE_EXISTS;
+ while (1) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (BTRFS_I(dentry->d_inode)->generation <=
- root->fs_info->last_trans_committed)
+ inode = parent->d_inode;
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+ }
+ if (parent == sb->s_root)
break;
+
+ parent = parent->d_parent;
}
- end_log_trans(root);
- return 0;
+no_parent:
+ ret = 0;
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
}
/*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry)
{
- u64 gen;
- gen = root->fs_info->last_trans_new_blockgroup;
- if (gen > root->fs_info->last_trans_committed)
- return 1;
- else
- return btrfs_log_dentry(trans, root, dentry);
+ return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+ dentry->d_parent, 0);
}
/*
@@ -2884,3 +3079,94 @@ again:
kfree(log_root_tree);
return 0;
}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ smp_mb();
+ if (BTRFS_I(dir)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent)
+{
+ struct btrfs_root * root = BTRFS_I(inode)->root;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ if (BTRFS_I(inode)->logged_trans <=
+ root->fs_info->last_trans_committed &&
+ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+ root->fs_info->last_trans_committed))
+ return 0;
+
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed0..d09c7609e16 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent);
#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97c..c2fa1be4923 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
head = page_buffers(page);
bh = head;
do {
- if (bh->b_blocknr == block) {
+ if (!buffer_mapped(bh))
+ all_mapped = 0;
+ else if (bh->b_blocknr == block) {
ret = bh;
get_bh(bh);
goto out_unlock;
}
- if (!buffer_mapped(bh))
- all_mapped = 0;
bh = bh->b_this_page;
} while (bh != head);
@@ -290,7 +290,7 @@ static void free_more_memory(void)
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS);
+ GFP_NOFS, NULL);
}
}
@@ -547,6 +547,39 @@ repeat:
return err;
}
+void do_thaw_all(unsigned long unused)
+{
+ struct super_block *sb;
+ char b[BDEVNAME_SIZE];
+
+ spin_lock(&sb_lock);
+restart:
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+ printk(KERN_WARNING "Emergency Thaw on %s\n",
+ bdevname(sb->s_bdev, b));
+ up_read(&sb->s_umount);
+ spin_lock(&sb_lock);
+ if (__put_super_and_need_restart(sb))
+ goto restart;
+ }
+ spin_unlock(&sb_lock);
+ printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+ pdflush_operation(do_thaw_all, 0);
+}
+
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
@@ -621,14 +654,7 @@ static void __set_page_dirty(struct page *page,
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
-
- if (mapping_cap_account_dirty(mapping)) {
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- __inc_bdi_stat(mapping->backing_dev_info,
- BDI_RECLAIMABLE);
- task_dirty_inc(current);
- task_io_account_write(PAGE_CACHE_SIZE);
- }
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -2320,13 +2346,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
* unlock the page.
*/
int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
+ struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
unsigned long end;
loff_t size;
- int ret = -EINVAL;
+ int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
lock_page(page);
size = i_size_read(inode);
@@ -2346,6 +2373,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
if (!ret)
ret = block_commit_write(page, 0, end);
+ if (unlikely(ret)) {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else /* -ENOSPC, -EIO, etc */
+ ret = VM_FAULT_SIGBUS;
+ }
+
out_unlock:
unlock_page(page);
return ret;
@@ -3281,7 +3315,6 @@ EXPORT_SYMBOL(cont_write_begin);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(file_fsync);
-EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644
index 00000000000..80e9c6167f0
--- /dev/null
+++ b/fs/cachefiles/Kconfig
@@ -0,0 +1,39 @@
+
+config CACHEFILES
+ tristate "Filesystem caching on files"
+ depends on FSCACHE && BLOCK
+ help
+ This permits use of a mounted filesystem as a cache for other
+ filesystems - primarily networking filesystems - thus allowing fast
+ local disk to enhance the speed of slower devices.
+
+ See Documentation/filesystems/caching/cachefiles.txt for more
+ information.
+
+config CACHEFILES_DEBUG
+ bool "Debug CacheFiles"
+ depends on CACHEFILES
+ help
+ This permits debugging to be dynamically enabled in the filesystem
+ caching on files module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
+ by including a debugging specifier in /etc/cachefilesd.conf.
+
+config CACHEFILES_HISTOGRAM
+ bool "Gather latency information on CacheFiles"
+ depends on CACHEFILES && PROC_FS
+ help
+
+ This option causes latency information to be gathered on CacheFiles
+ operation and exported through file:
+
+ /proc/fs/cachefiles/histogram
+
+ The generation of this histogram adds a certain amount of overhead to
+ execution as there are a number of points at which data is gathered,
+ and on a multi-CPU system these may be on cachelines that keep
+ bouncing between CPUs. On the other hand, the histogram may be
+ useful for debugging purposes. Saying 'N' here is recommended.
+
+ See Documentation/filesystems/caching/cachefiles.txt for more
+ information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644
index 00000000000..32cbab0ffce
--- /dev/null
+++ b/fs/cachefiles/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for caching in a mounted filesystem
+#
+
+cachefiles-y := \
+ bind.o \
+ daemon.o \
+ interface.o \
+ key.o \
+ main.o \
+ namei.o \
+ rdwr.o \
+ security.o \
+ xattr.o
+
+cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
+
+obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644
index 00000000000..3797e0077b3
--- /dev/null
+++ b/fs/cachefiles/bind.c
@@ -0,0 +1,286 @@
+/* Bind and unbind a cache from the filesystem backing it
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include "internal.h"
+
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
+
+/*
+ * bind a directory as a cache
+ */
+int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
+{
+ _enter("{%u,%u,%u,%u,%u,%u},%s",
+ cache->frun_percent,
+ cache->fcull_percent,
+ cache->fstop_percent,
+ cache->brun_percent,
+ cache->bcull_percent,
+ cache->bstop_percent,
+ args);
+
+ /* start by checking things over */
+ ASSERT(cache->fstop_percent >= 0 &&
+ cache->fstop_percent < cache->fcull_percent &&
+ cache->fcull_percent < cache->frun_percent &&
+ cache->frun_percent < 100);
+
+ ASSERT(cache->bstop_percent >= 0 &&
+ cache->bstop_percent < cache->bcull_percent &&
+ cache->bcull_percent < cache->brun_percent &&
+ cache->brun_percent < 100);
+
+ if (*args) {
+ kerror("'bind' command doesn't take an argument");
+ return -EINVAL;
+ }
+
+ if (!cache->rootdirname) {
+ kerror("No cache directory specified");
+ return -EINVAL;
+ }
+
+ /* don't permit already bound caches to be re-bound */
+ if (test_bit(CACHEFILES_READY, &cache->flags)) {
+ kerror("Cache already bound");
+ return -EBUSY;
+ }
+
+ /* make sure we have copies of the tag and dirname strings */
+ if (!cache->tag) {
+ /* the tag string is released by the fops->release()
+ * function, so we don't release it on error here */
+ cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
+ if (!cache->tag)
+ return -ENOMEM;
+ }
+
+ /* add the cache */
+ return cachefiles_daemon_add_cache(cache);
+}
+
+/*
+ * add a cache
+ */
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
+{
+ struct cachefiles_object *fsdef;
+ struct nameidata nd;
+ struct kstatfs stats;
+ struct dentry *graveyard, *cachedir, *root;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("");
+
+ /* we want to work under the module's security ID */
+ ret = cachefiles_get_security_ID(cache);
+ if (ret < 0)
+ return ret;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+
+ /* allocate the root index object */
+ ret = -ENOMEM;
+
+ fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+ if (!fsdef)
+ goto error_root_object;
+
+ ASSERTCMP(fsdef->backer, ==, NULL);
+
+ atomic_set(&fsdef->usage, 1);
+ fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
+
+ _debug("- fsdef %p", fsdef);
+
+ /* look up the directory at the root of the cache */
+ memset(&nd, 0, sizeof(nd));
+
+ ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
+ if (ret < 0)
+ goto error_open_root;
+
+ cache->mnt = mntget(nd.path.mnt);
+ root = dget(nd.path.dentry);
+ path_put(&nd.path);
+
+ /* check parameters */
+ ret = -EOPNOTSUPP;
+ if (!root->d_inode ||
+ !root->d_inode->i_op ||
+ !root->d_inode->i_op->lookup ||
+ !root->d_inode->i_op->mkdir ||
+ !root->d_inode->i_op->setxattr ||
+ !root->d_inode->i_op->getxattr ||
+ !root->d_sb ||
+ !root->d_sb->s_op ||
+ !root->d_sb->s_op->statfs ||
+ !root->d_sb->s_op->sync_fs)
+ goto error_unsupported;
+
+ ret = -EROFS;
+ if (root->d_sb->s_flags & MS_RDONLY)
+ goto error_unsupported;
+
+ /* determine the security of the on-disk cache as this governs
+ * security ID of files we create */
+ ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
+ if (ret < 0)
+ goto error_unsupported;
+
+ /* get the cache size and blocksize */
+ ret = vfs_statfs(root, &stats);
+ if (ret < 0)
+ goto error_unsupported;
+
+ ret = -ERANGE;
+ if (stats.f_bsize <= 0)
+ goto error_unsupported;
+
+ ret = -EOPNOTSUPP;
+ if (stats.f_bsize > PAGE_SIZE)
+ goto error_unsupported;
+
+ cache->bsize = stats.f_bsize;
+ cache->bshift = 0;
+ if (stats.f_bsize < PAGE_SIZE)
+ cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
+
+ _debug("blksize %u (shift %u)",
+ cache->bsize, cache->bshift);
+
+ _debug("size %llu, avail %llu",
+ (unsigned long long) stats.f_blocks,
+ (unsigned long long) stats.f_bavail);
+
+ /* set up caching limits */
+ do_div(stats.f_files, 100);
+ cache->fstop = stats.f_files * cache->fstop_percent;
+ cache->fcull = stats.f_files * cache->fcull_percent;
+ cache->frun = stats.f_files * cache->frun_percent;
+
+ _debug("limits {%llu,%llu,%llu} files",
+ (unsigned long long) cache->frun,
+ (unsigned long long) cache->fcull,
+ (unsigned long long) cache->fstop);
+
+ stats.f_blocks >>= cache->bshift;
+ do_div(stats.f_blocks, 100);
+ cache->bstop = stats.f_blocks * cache->bstop_percent;
+ cache->bcull = stats.f_blocks * cache->bcull_percent;
+ cache->brun = stats.f_blocks * cache->brun_percent;
+
+ _debug("limits {%llu,%llu,%llu} blocks",
+ (unsigned long long) cache->brun,
+ (unsigned long long) cache->bcull,
+ (unsigned long long) cache->bstop);
+
+ /* get the cache directory and check its type */
+ cachedir = cachefiles_get_directory(cache, root, "cache");
+ if (IS_ERR(cachedir)) {
+ ret = PTR_ERR(cachedir);
+ goto error_unsupported;
+ }
+
+ fsdef->dentry = cachedir;
+ fsdef->fscache.cookie = NULL;
+
+ ret = cachefiles_check_object_type(fsdef);
+ if (ret < 0)
+ goto error_unsupported;
+
+ /* get the graveyard directory */
+ graveyard = cachefiles_get_directory(cache, root, "graveyard");
+ if (IS_ERR(graveyard)) {
+ ret = PTR_ERR(graveyard);
+ goto error_unsupported;
+ }
+
+ cache->graveyard = graveyard;
+
+ /* publish the cache */
+ fscache_init_cache(&cache->cache,
+ &cachefiles_cache_ops,
+ "%s",
+ fsdef->dentry->d_sb->s_id);
+
+ fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
+
+ ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
+ if (ret < 0)
+ goto error_add_cache;
+
+ /* done */
+ set_bit(CACHEFILES_READY, &cache->flags);
+ dput(root);
+
+ printk(KERN_INFO "CacheFiles:"
+ " File cache on %s registered\n",
+ cache->cache.identifier);
+
+ /* check how much space the cache has */
+ cachefiles_has_space(cache, 0, 0);
+ cachefiles_end_secure(cache, saved_cred);
+ return 0;
+
+error_add_cache:
+ dput(cache->graveyard);
+ cache->graveyard = NULL;
+error_unsupported:
+ mntput(cache->mnt);
+ cache->mnt = NULL;
+ dput(fsdef->dentry);
+ fsdef->dentry = NULL;
+ dput(root);
+error_open_root:
+ kmem_cache_free(cachefiles_object_jar, fsdef);
+error_root_object:
+ cachefiles_end_secure(cache, saved_cred);
+ kerror("Failed to register: %d", ret);
+ return ret;
+}
+
+/*
+ * unbind a cache on fd release
+ */
+void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
+{
+ _enter("");
+
+ if (test_bit(CACHEFILES_READY, &cache->flags)) {
+ printk(KERN_INFO "CacheFiles:"
+ " File cache on %s unregistering\n",
+ cache->cache.identifier);
+
+ fscache_withdraw_cache(&cache->cache);
+ }
+
+ dput(cache->graveyard);
+ mntput(cache->mnt);
+
+ kfree(cache->rootdirname);
+ kfree(cache->secctx);
+ kfree(cache->tag);
+
+ _leave("");
+}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644
index 00000000000..4618516dd99
--- /dev/null
+++ b/fs/cachefiles/daemon.c
@@ -0,0 +1,755 @@
+/* Daemon interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include <linux/fs_struct.h>
+#include "internal.h"
+
+static int cachefiles_daemon_open(struct inode *, struct file *);
+static int cachefiles_daemon_release(struct inode *, struct file *);
+static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
+ loff_t *);
+static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
+ size_t, loff_t *);
+static unsigned int cachefiles_daemon_poll(struct file *,
+ struct poll_table_struct *);
+static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
+
+static unsigned long cachefiles_open;
+
+const struct file_operations cachefiles_daemon_fops = {
+ .owner = THIS_MODULE,
+ .open = cachefiles_daemon_open,
+ .release = cachefiles_daemon_release,
+ .read = cachefiles_daemon_read,
+ .write = cachefiles_daemon_write,
+ .poll = cachefiles_daemon_poll,
+};
+
+struct cachefiles_daemon_cmd {
+ char name[8];
+ int (*handler)(struct cachefiles_cache *cache, char *args);
+};
+
+static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
+ { "bind", cachefiles_daemon_bind },
+ { "brun", cachefiles_daemon_brun },
+ { "bcull", cachefiles_daemon_bcull },
+ { "bstop", cachefiles_daemon_bstop },
+ { "cull", cachefiles_daemon_cull },
+ { "debug", cachefiles_daemon_debug },
+ { "dir", cachefiles_daemon_dir },
+ { "frun", cachefiles_daemon_frun },
+ { "fcull", cachefiles_daemon_fcull },
+ { "fstop", cachefiles_daemon_fstop },
+ { "inuse", cachefiles_daemon_inuse },
+ { "secctx", cachefiles_daemon_secctx },
+ { "tag", cachefiles_daemon_tag },
+ { "", NULL }
+};
+
+
+/*
+ * do various checks
+ */
+static int cachefiles_daemon_open(struct inode *inode, struct file *file)
+{
+ struct cachefiles_cache *cache;
+
+ _enter("");
+
+ /* only the superuser may do this */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* the cachefiles device may only be open once at a time */
+ if (xchg(&cachefiles_open, 1) == 1)
+ return -EBUSY;
+
+ /* allocate a cache record */
+ cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
+ if (!cache) {
+ cachefiles_open = 0;
+ return -ENOMEM;
+ }
+
+ mutex_init(&cache->daemon_mutex);
+ cache->active_nodes = RB_ROOT;
+ rwlock_init(&cache->active_lock);
+ init_waitqueue_head(&cache->daemon_pollwq);
+
+ /* set default caching limits
+ * - limit at 1% free space and/or free files
+ * - cull below 5% free space and/or free files
+ * - cease culling above 7% free space and/or free files
+ */
+ cache->frun_percent = 7;
+ cache->fcull_percent = 5;
+ cache->fstop_percent = 1;
+ cache->brun_percent = 7;
+ cache->bcull_percent = 5;
+ cache->bstop_percent = 1;
+
+ file->private_data = cache;
+ cache->cachefilesd = file;
+ return 0;
+}
+
+/*
+ * release a cache
+ */
+static int cachefiles_daemon_release(struct inode *inode, struct file *file)
+{
+ struct cachefiles_cache *cache = file->private_data;
+
+ _enter("");
+
+ ASSERT(cache);
+
+ set_bit(CACHEFILES_DEAD, &cache->flags);
+
+ cachefiles_daemon_unbind(cache);
+
+ ASSERT(!cache->active_nodes.rb_node);
+
+ /* clean up the control file interface */
+ cache->cachefilesd = NULL;
+ file->private_data = NULL;
+ cachefiles_open = 0;
+
+ kfree(cache);
+
+ _leave("");
+ return 0;
+}
+
+/*
+ * read the cache state
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+ size_t buflen, loff_t *pos)
+{
+ struct cachefiles_cache *cache = file->private_data;
+ char buffer[256];
+ int n;
+
+ //_enter(",,%zu,", buflen);
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags))
+ return 0;
+
+ /* check how much space the cache has */
+ cachefiles_has_space(cache, 0, 0);
+
+ /* summarise */
+ clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+
+ n = snprintf(buffer, sizeof(buffer),
+ "cull=%c"
+ " frun=%llx"
+ " fcull=%llx"
+ " fstop=%llx"
+ " brun=%llx"
+ " bcull=%llx"
+ " bstop=%llx",
+ test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
+ (unsigned long long) cache->frun,
+ (unsigned long long) cache->fcull,
+ (unsigned long long) cache->fstop,
+ (unsigned long long) cache->brun,
+ (unsigned long long) cache->bcull,
+ (unsigned long long) cache->bstop
+ );
+
+ if (n > buflen)
+ return -EMSGSIZE;
+
+ if (copy_to_user(_buffer, buffer, n) != 0)
+ return -EFAULT;
+
+ return n;
+}
+
+/*
+ * command the cache
+ */
+static ssize_t cachefiles_daemon_write(struct file *file,
+ const char __user *_data,
+ size_t datalen,
+ loff_t *pos)
+{
+ const struct cachefiles_daemon_cmd *cmd;
+ struct cachefiles_cache *cache = file->private_data;
+ ssize_t ret;
+ char *data, *args, *cp;
+
+ //_enter(",,%zu,", datalen);
+
+ ASSERT(cache);
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags))
+ return -EIO;
+
+ if (datalen < 0 || datalen > PAGE_SIZE - 1)
+ return -EOPNOTSUPP;
+
+ /* drag the command string into the kernel so we can parse it */
+ data = kmalloc(datalen + 1, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(data, _data, datalen) != 0)
+ goto error;
+
+ data[datalen] = '\0';
+
+ ret = -EINVAL;
+ if (memchr(data, '\0', datalen))
+ goto error;
+
+ /* strip any newline */
+ cp = memchr(data, '\n', datalen);
+ if (cp) {
+ if (cp == data)
+ goto error;
+
+ *cp = '\0';
+ }
+
+ /* parse the command */
+ ret = -EOPNOTSUPP;
+
+ for (args = data; *args; args++)
+ if (isspace(*args))
+ break;
+ if (*args) {
+ if (args == data)
+ goto error;
+ *args = '\0';
+ for (args++; isspace(*args); args++)
+ continue;
+ }
+
+ /* run the appropriate command handler */
+ for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
+ if (strcmp(cmd->name, data) == 0)
+ goto found_command;
+
+error:
+ kfree(data);
+ //_leave(" = %zd", ret);
+ return ret;
+
+found_command:
+ mutex_lock(&cache->daemon_mutex);
+
+ ret = -EIO;
+ if (!test_bit(CACHEFILES_DEAD, &cache->flags))
+ ret = cmd->handler(cache, args);
+
+ mutex_unlock(&cache->daemon_mutex);
+
+ if (ret == 0)
+ ret = datalen;
+ goto error;
+}
+
+/*
+ * poll for culling state
+ * - use POLLOUT to indicate culling state
+ */
+static unsigned int cachefiles_daemon_poll(struct file *file,
+ struct poll_table_struct *poll)
+{
+ struct cachefiles_cache *cache = file->private_data;
+ unsigned int mask;
+
+ poll_wait(file, &cache->daemon_pollwq, poll);
+ mask = 0;
+
+ if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+ mask |= POLLIN;
+
+ if (test_bit(CACHEFILES_CULLING, &cache->flags))
+ mask |= POLLOUT;
+
+ return mask;
+}
+
+/*
+ * give a range error for cache space constraints
+ * - can be tail-called
+ */
+static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
+ char *args)
+{
+ kerror("Free space limits must be in range"
+ " 0%%<=stop<cull<run<100%%");
+
+ return -EINVAL;
+}
+
+/*
+ * set the percentage of files at which to stop culling
+ * - command: "frun <N>%"
+ */
+static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long frun;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ frun = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (frun <= cache->fcull_percent || frun >= 100)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->frun_percent = frun;
+ return 0;
+}
+
+/*
+ * set the percentage of files at which to start culling
+ * - command: "fcull <N>%"
+ */
+static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long fcull;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ fcull = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->fcull_percent = fcull;
+ return 0;
+}
+
+/*
+ * set the percentage of files at which to stop allocating
+ * - command: "fstop <N>%"
+ */
+static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long fstop;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ fstop = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (fstop < 0 || fstop >= cache->fcull_percent)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->fstop_percent = fstop;
+ return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop culling
+ * - command: "brun <N>%"
+ */
+static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long brun;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ brun = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (brun <= cache->bcull_percent || brun >= 100)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->brun_percent = brun;
+ return 0;
+}
+
+/*
+ * set the percentage of blocks at which to start culling
+ * - command: "bcull <N>%"
+ */
+static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long bcull;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ bcull = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->bcull_percent = bcull;
+ return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop allocating
+ * - command: "bstop <N>%"
+ */
+static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long bstop;
+
+ _enter(",%s", args);
+
+ if (!*args)
+ return -EINVAL;
+
+ bstop = simple_strtoul(args, &args, 10);
+ if (args[0] != '%' || args[1] != '\0')
+ return -EINVAL;
+
+ if (bstop < 0 || bstop >= cache->bcull_percent)
+ return cachefiles_daemon_range_error(cache, args);
+
+ cache->bstop_percent = bstop;
+ return 0;
+}
+
+/*
+ * set the cache directory
+ * - command: "dir <name>"
+ */
+static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
+{
+ char *dir;
+
+ _enter(",%s", args);
+
+ if (!*args) {
+ kerror("Empty directory specified");
+ return -EINVAL;
+ }
+
+ if (cache->rootdirname) {
+ kerror("Second cache directory specified");
+ return -EEXIST;
+ }
+
+ dir = kstrdup(args, GFP_KERNEL);
+ if (!dir)
+ return -ENOMEM;
+
+ cache->rootdirname = dir;
+ return 0;
+}
+
+/*
+ * set the cache security context
+ * - command: "secctx <ctx>"
+ */
+static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
+{
+ char *secctx;
+
+ _enter(",%s", args);
+
+ if (!*args) {
+ kerror("Empty security context specified");
+ return -EINVAL;
+ }
+
+ if (cache->secctx) {
+ kerror("Second security context specified");
+ return -EINVAL;
+ }
+
+ secctx = kstrdup(args, GFP_KERNEL);
+ if (!secctx)
+ return -ENOMEM;
+
+ cache->secctx = secctx;
+ return 0;
+}
+
+/*
+ * set the cache tag
+ * - command: "tag <name>"
+ */
+static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
+{
+ char *tag;
+
+ _enter(",%s", args);
+
+ if (!*args) {
+ kerror("Empty tag specified");
+ return -EINVAL;
+ }
+
+ if (cache->tag)
+ return -EEXIST;
+
+ tag = kstrdup(args, GFP_KERNEL);
+ if (!tag)
+ return -ENOMEM;
+
+ cache->tag = tag;
+ return 0;
+}
+
+/*
+ * request a node in the cache be culled from the current working directory
+ * - command: "cull <name>"
+ */
+static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
+{
+ struct fs_struct *fs;
+ struct dentry *dir;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter(",%s", args);
+
+ if (strchr(args, '/'))
+ goto inval;
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+ kerror("cull applied to unready cache");
+ return -EIO;
+ }
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ kerror("cull applied to dead cache");
+ return -EIO;
+ }
+
+ /* extract the directory dentry from the cwd */
+ fs = current->fs;
+ read_lock(&fs->lock);
+ dir = dget(fs->pwd.dentry);
+ read_unlock(&fs->lock);
+
+ if (!S_ISDIR(dir->d_inode->i_mode))
+ goto notdir;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = cachefiles_cull(cache, dir, args);
+ cachefiles_end_secure(cache, saved_cred);
+
+ dput(dir);
+ _leave(" = %d", ret);
+ return ret;
+
+notdir:
+ dput(dir);
+ kerror("cull command requires dirfd to be a directory");
+ return -ENOTDIR;
+
+inval:
+ kerror("cull command requires dirfd and filename");
+ return -EINVAL;
+}
+
+/*
+ * set debugging mode
+ * - command: "debug <mask>"
+ */
+static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
+{
+ unsigned long mask;
+
+ _enter(",%s", args);
+
+ mask = simple_strtoul(args, &args, 0);
+ if (args[0] != '\0')
+ goto inval;
+
+ cachefiles_debug = mask;
+ _leave(" = 0");
+ return 0;
+
+inval:
+ kerror("debug command requires mask");
+ return -EINVAL;
+}
+
+/*
+ * find out whether an object in the current working directory is in use or not
+ * - command: "inuse <name>"
+ */
+static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
+{
+ struct fs_struct *fs;
+ struct dentry *dir;
+ const struct cred *saved_cred;
+ int ret;
+
+ //_enter(",%s", args);
+
+ if (strchr(args, '/'))
+ goto inval;
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+ kerror("inuse applied to unready cache");
+ return -EIO;
+ }
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ kerror("inuse applied to dead cache");
+ return -EIO;
+ }
+
+ /* extract the directory dentry from the cwd */
+ fs = current->fs;
+ read_lock(&fs->lock);
+ dir = dget(fs->pwd.dentry);
+ read_unlock(&fs->lock);
+
+ if (!S_ISDIR(dir->d_inode->i_mode))
+ goto notdir;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = cachefiles_check_in_use(cache, dir, args);
+ cachefiles_end_secure(cache, saved_cred);
+
+ dput(dir);
+ //_leave(" = %d", ret);
+ return ret;
+
+notdir:
+ dput(dir);
+ kerror("inuse command requires dirfd to be a directory");
+ return -ENOTDIR;
+
+inval:
+ kerror("inuse command requires dirfd and filename");
+ return -EINVAL;
+}
+
+/*
+ * see if we have space for a number of pages and/or a number of files in the
+ * cache
+ */
+int cachefiles_has_space(struct cachefiles_cache *cache,
+ unsigned fnr, unsigned bnr)
+{
+ struct kstatfs stats;
+ int ret;
+
+ //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
+ // (unsigned long long) cache->frun,
+ // (unsigned long long) cache->fcull,
+ // (unsigned long long) cache->fstop,
+ // (unsigned long long) cache->brun,
+ // (unsigned long long) cache->bcull,
+ // (unsigned long long) cache->bstop,
+ // fnr, bnr);
+
+ /* find out how many pages of blockdev are available */
+ memset(&stats, 0, sizeof(stats));
+
+ ret = vfs_statfs(cache->mnt->mnt_root, &stats);
+ if (ret < 0) {
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "statfs failed");
+ _leave(" = %d", ret);
+ return ret;
+ }
+
+ stats.f_bavail >>= cache->bshift;
+
+ //_debug("avail %llu,%llu",
+ // (unsigned long long) stats.f_ffree,
+ // (unsigned long long) stats.f_bavail);
+
+ /* see if there is sufficient space */
+ if (stats.f_ffree > fnr)
+ stats.f_ffree -= fnr;
+ else
+ stats.f_ffree = 0;
+
+ if (stats.f_bavail > bnr)
+ stats.f_bavail -= bnr;
+ else
+ stats.f_bavail = 0;
+
+ ret = -ENOBUFS;
+ if (stats.f_ffree < cache->fstop ||
+ stats.f_bavail < cache->bstop)
+ goto begin_cull;
+
+ ret = 0;
+ if (stats.f_ffree < cache->fcull ||
+ stats.f_bavail < cache->bcull)
+ goto begin_cull;
+
+ if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
+ stats.f_ffree >= cache->frun &&
+ stats.f_bavail >= cache->brun &&
+ test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
+ ) {
+ _debug("cease culling");
+ cachefiles_state_changed(cache);
+ }
+
+ //_leave(" = 0");
+ return 0;
+
+begin_cull:
+ if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
+ _debug("### CULL CACHE ###");
+ cachefiles_state_changed(cache);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644
index 00000000000..1e962348d11
--- /dev/null
+++ b/fs/cachefiles/interface.c
@@ -0,0 +1,449 @@
+/* FS-Cache interface to CacheFiles
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+struct cachefiles_lookup_data {
+ struct cachefiles_xattr *auxdata; /* auxiliary data */
+ char *key; /* key path */
+};
+
+static int cachefiles_attr_changed(struct fscache_object *_object);
+
+/*
+ * allocate an object record for a cookie lookup and prepare the lookup data
+ */
+static struct fscache_object *cachefiles_alloc_object(
+ struct fscache_cache *_cache,
+ struct fscache_cookie *cookie)
+{
+ struct cachefiles_lookup_data *lookup_data;
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct cachefiles_xattr *auxdata;
+ unsigned keylen, auxlen;
+ void *buffer;
+ char *key;
+
+ cache = container_of(_cache, struct cachefiles_cache, cache);
+
+ _enter("{%s},%p,", cache->cache.identifier, cookie);
+
+ lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+ if (!lookup_data)
+ goto nomem_lookup_data;
+
+ /* create a new object record and a temporary leaf image */
+ object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+ if (!object)
+ goto nomem_object;
+
+ ASSERTCMP(object->backer, ==, NULL);
+
+ BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+ atomic_set(&object->usage, 1);
+
+ fscache_object_init(&object->fscache, cookie, &cache->cache);
+
+ object->type = cookie->def->type;
+
+ /* get hold of the raw key
+ * - stick the length on the front and leave space on the back for the
+ * encoder
+ */
+ buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+ if (!buffer)
+ goto nomem_buffer;
+
+ keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
+ ASSERTCMP(keylen, <, 512);
+
+ *(uint16_t *)buffer = keylen;
+ ((char *)buffer)[keylen + 2] = 0;
+ ((char *)buffer)[keylen + 3] = 0;
+ ((char *)buffer)[keylen + 4] = 0;
+
+ /* turn the raw key into something that can work with as a filename */
+ key = cachefiles_cook_key(buffer, keylen + 2, object->type);
+ if (!key)
+ goto nomem_key;
+
+ /* get hold of the auxiliary data and prepend the object type */
+ auxdata = buffer;
+ auxlen = 0;
+ if (cookie->def->get_aux) {
+ auxlen = cookie->def->get_aux(cookie->netfs_data,
+ auxdata->data, 511);
+ ASSERTCMP(auxlen, <, 511);
+ }
+
+ auxdata->len = auxlen + 1;
+ auxdata->type = cookie->def->type;
+
+ lookup_data->auxdata = auxdata;
+ lookup_data->key = key;
+ object->lookup_data = lookup_data;
+
+ _leave(" = %p [%p]", &object->fscache, lookup_data);
+ return &object->fscache;
+
+nomem_key:
+ kfree(buffer);
+nomem_buffer:
+ BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+ kmem_cache_free(cachefiles_object_jar, object);
+ fscache_object_destroyed(&cache->cache);
+nomem_object:
+ kfree(lookup_data);
+nomem_lookup_data:
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * attempt to look up the nominated node in this cache
+ */
+static void cachefiles_lookup_object(struct fscache_object *_object)
+{
+ struct cachefiles_lookup_data *lookup_data;
+ struct cachefiles_object *parent, *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("{OBJ%x}", _object->debug_id);
+
+ cache = container_of(_object->cache, struct cachefiles_cache, cache);
+ parent = container_of(_object->parent,
+ struct cachefiles_object, fscache);
+ object = container_of(_object, struct cachefiles_object, fscache);
+ lookup_data = object->lookup_data;
+
+ ASSERTCMP(lookup_data, !=, NULL);
+
+ /* look up the key, creating any missing bits */
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = cachefiles_walk_to_object(parent, object,
+ lookup_data->key,
+ lookup_data->auxdata);
+ cachefiles_end_secure(cache, saved_cred);
+
+ /* polish off by setting the attributes of non-index files */
+ if (ret == 0 &&
+ object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+ cachefiles_attr_changed(&object->fscache);
+
+ if (ret < 0) {
+ printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
+ ret);
+ fscache_object_lookup_error(&object->fscache);
+ }
+
+ _leave(" [%d]", ret);
+}
+
+/*
+ * indication of lookup completion
+ */
+static void cachefiles_lookup_complete(struct fscache_object *_object)
+{
+ struct cachefiles_object *object;
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+
+ _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
+
+ if (object->lookup_data) {
+ kfree(object->lookup_data->key);
+ kfree(object->lookup_data->auxdata);
+ kfree(object->lookup_data);
+ object->lookup_data = NULL;
+ }
+}
+
+/*
+ * increment the usage count on an inode object (may fail if unmounting)
+ */
+static
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+{
+ struct cachefiles_object *object =
+ container_of(_object, struct cachefiles_object, fscache);
+
+ _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+ ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+ atomic_inc(&object->usage);
+ return &object->fscache;
+}
+
+/*
+ * update the auxilliary data for an object object on disk
+ */
+static void cachefiles_update_object(struct fscache_object *_object)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_xattr *auxdata;
+ struct cachefiles_cache *cache;
+ struct fscache_cookie *cookie;
+ const struct cred *saved_cred;
+ unsigned auxlen;
+
+ _enter("{OBJ%x}", _object->debug_id);
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache, struct cachefiles_cache,
+ cache);
+ cookie = object->fscache.cookie;
+
+ if (!cookie->def->get_aux) {
+ _leave(" [no aux]");
+ return;
+ }
+
+ auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+ if (!auxdata) {
+ _leave(" [nomem]");
+ return;
+ }
+
+ auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+ ASSERTCMP(auxlen, <, 511);
+
+ auxdata->len = auxlen + 1;
+ auxdata->type = cookie->def->type;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_update_object_xattr(object, auxdata);
+ cachefiles_end_secure(cache, saved_cred);
+ kfree(auxdata);
+ _leave("");
+}
+
+/*
+ * discard the resources pinned by an object and effect retirement if
+ * requested
+ */
+static void cachefiles_drop_object(struct fscache_object *_object)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+
+ ASSERT(_object);
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+
+ _enter("{OBJ%x,%d}",
+ object->fscache.debug_id, atomic_read(&object->usage));
+
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+#ifdef CACHEFILES_DEBUG_SLAB
+ ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+ /* delete retired objects */
+ if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+ _object != cache->cache.fsdef
+ ) {
+ _debug("- retire object OBJ%x", object->fscache.debug_id);
+ cachefiles_begin_secure(cache, &saved_cred);
+ cachefiles_delete_object(cache, object);
+ cachefiles_end_secure(cache, saved_cred);
+ }
+
+ /* close the filesystem stuff attached to the object */
+ if (object->backer != object->dentry)
+ dput(object->backer);
+ object->backer = NULL;
+
+ /* note that the object is now inactive */
+ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+ write_lock(&cache->active_lock);
+ if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
+ &object->flags))
+ BUG();
+ rb_erase(&object->active_node, &cache->active_nodes);
+ wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+ write_unlock(&cache->active_lock);
+ }
+
+ dput(object->dentry);
+ object->dentry = NULL;
+
+ _leave("");
+}
+
+/*
+ * dispose of a reference to an object
+ */
+static void cachefiles_put_object(struct fscache_object *_object)
+{
+ struct cachefiles_object *object;
+ struct fscache_cache *cache;
+
+ ASSERT(_object);
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+
+ _enter("{OBJ%x,%d}",
+ object->fscache.debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+ ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+ ASSERTIFCMP(object->fscache.parent,
+ object->fscache.parent->n_children, >, 0);
+
+ if (atomic_dec_and_test(&object->usage)) {
+ _debug("- kill object OBJ%x", object->fscache.debug_id);
+
+ ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+ ASSERTCMP(object->fscache.parent, ==, NULL);
+ ASSERTCMP(object->backer, ==, NULL);
+ ASSERTCMP(object->dentry, ==, NULL);
+ ASSERTCMP(object->fscache.n_ops, ==, 0);
+ ASSERTCMP(object->fscache.n_children, ==, 0);
+
+ if (object->lookup_data) {
+ kfree(object->lookup_data->key);
+ kfree(object->lookup_data->auxdata);
+ kfree(object->lookup_data);
+ object->lookup_data = NULL;
+ }
+
+ cache = object->fscache.cache;
+ kmem_cache_free(cachefiles_object_jar, object);
+ fscache_object_destroyed(cache);
+ }
+
+ _leave("");
+}
+
+/*
+ * sync a cache
+ */
+static void cachefiles_sync_cache(struct fscache_cache *_cache)
+{
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ int ret;
+
+ _enter("%p", _cache);
+
+ cache = container_of(_cache, struct cachefiles_cache, cache);
+
+ /* make sure all pages pinned by operations on behalf of the netfs are
+ * written to disc */
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = fsync_super(cache->mnt->mnt_sb);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret == -EIO)
+ cachefiles_io_error(cache,
+ "Attempt to sync backing fs superblock"
+ " returned error %d",
+ ret);
+}
+
+/*
+ * notification the attributes on an object have changed
+ * - called with reads/writes excluded by FS-Cache
+ */
+static int cachefiles_attr_changed(struct fscache_object *_object)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ struct iattr newattrs;
+ uint64_t ni_size;
+ loff_t oi_size;
+ int ret;
+
+ _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+
+ _enter("{OBJ%x},[%llu]",
+ _object->debug_id, (unsigned long long) ni_size);
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ if (ni_size == object->i_size)
+ return 0;
+
+ if (!object->backer)
+ return -ENOBUFS;
+
+ ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+ fscache_set_store_limit(&object->fscache, ni_size);
+
+ oi_size = i_size_read(object->backer->d_inode);
+ if (oi_size == ni_size)
+ return 0;
+
+ newattrs.ia_size = ni_size;
+ newattrs.ia_valid = ATTR_SIZE;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ mutex_lock(&object->backer->d_inode->i_mutex);
+ ret = notify_change(object->backer, &newattrs);
+ mutex_unlock(&object->backer->d_inode->i_mutex);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret == -EIO) {
+ fscache_set_store_limit(&object->fscache, 0);
+ cachefiles_io_error_obj(object, "Size set failed");
+ ret = -ENOBUFS;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * dissociate a cache from all the pages it was backing
+ */
+static void cachefiles_dissociate_pages(struct fscache_cache *cache)
+{
+ _enter("");
+}
+
+const struct fscache_cache_ops cachefiles_cache_ops = {
+ .name = "cachefiles",
+ .alloc_object = cachefiles_alloc_object,
+ .lookup_object = cachefiles_lookup_object,
+ .lookup_complete = cachefiles_lookup_complete,
+ .grab_object = cachefiles_grab_object,
+ .update_object = cachefiles_update_object,
+ .drop_object = cachefiles_drop_object,
+ .put_object = cachefiles_put_object,
+ .sync_cache = cachefiles_sync_cache,
+ .attr_changed = cachefiles_attr_changed,
+ .read_or_alloc_page = cachefiles_read_or_alloc_page,
+ .read_or_alloc_pages = cachefiles_read_or_alloc_pages,
+ .allocate_page = cachefiles_allocate_page,
+ .allocate_pages = cachefiles_allocate_pages,
+ .write_page = cachefiles_write_page,
+ .uncache_page = cachefiles_uncache_page,
+ .dissociate_pages = cachefiles_dissociate_pages,
+};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644
index 00000000000..19218e1463d
--- /dev/null
+++ b/fs/cachefiles/internal.h
@@ -0,0 +1,360 @@
+/* General netfs cache on cache files internal defs
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+
+struct cachefiles_cache;
+struct cachefiles_object;
+
+extern unsigned cachefiles_debug;
+#define CACHEFILES_DEBUG_KENTER 1
+#define CACHEFILES_DEBUG_KLEAVE 2
+#define CACHEFILES_DEBUG_KDEBUG 4
+
+/*
+ * node records
+ */
+struct cachefiles_object {
+ struct fscache_object fscache; /* fscache handle */
+ struct cachefiles_lookup_data *lookup_data; /* cached lookup data */
+ struct dentry *dentry; /* the file/dir representing this object */
+ struct dentry *backer; /* backing file */
+ loff_t i_size; /* object size */
+ unsigned long flags;
+#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
+ atomic_t usage; /* object usage count */
+ uint8_t type; /* object type */
+ uint8_t new; /* T if object new */
+ spinlock_t work_lock;
+ struct rb_node active_node; /* link in active tree (dentry is key) */
+};
+
+extern struct kmem_cache *cachefiles_object_jar;
+
+/*
+ * Cache files cache definition
+ */
+struct cachefiles_cache {
+ struct fscache_cache cache; /* FS-Cache record */
+ struct vfsmount *mnt; /* mountpoint holding the cache */
+ struct dentry *graveyard; /* directory into which dead objects go */
+ struct file *cachefilesd; /* manager daemon handle */
+ const struct cred *cache_cred; /* security override for accessing cache */
+ struct mutex daemon_mutex; /* command serialisation mutex */
+ wait_queue_head_t daemon_pollwq; /* poll waitqueue for daemon */
+ struct rb_root active_nodes; /* active nodes (can't be culled) */
+ rwlock_t active_lock; /* lock for active_nodes */
+ atomic_t gravecounter; /* graveyard uniquifier */
+ unsigned frun_percent; /* when to stop culling (% files) */
+ unsigned fcull_percent; /* when to start culling (% files) */
+ unsigned fstop_percent; /* when to stop allocating (% files) */
+ unsigned brun_percent; /* when to stop culling (% blocks) */
+ unsigned bcull_percent; /* when to start culling (% blocks) */
+ unsigned bstop_percent; /* when to stop allocating (% blocks) */
+ unsigned bsize; /* cache's block size */
+ unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */
+ uint64_t frun; /* when to stop culling */
+ uint64_t fcull; /* when to start culling */
+ uint64_t fstop; /* when to stop allocating */
+ sector_t brun; /* when to stop culling */
+ sector_t bcull; /* when to start culling */
+ sector_t bstop; /* when to stop allocating */
+ unsigned long flags;
+#define CACHEFILES_READY 0 /* T if cache prepared */
+#define CACHEFILES_DEAD 1 /* T if cache dead */
+#define CACHEFILES_CULLING 2 /* T if cull engaged */
+#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
+ char *rootdirname; /* name of cache root directory */
+ char *secctx; /* LSM security context */
+ char *tag; /* cache binding tag */
+};
+
+/*
+ * backing file read tracking
+ */
+struct cachefiles_one_read {
+ wait_queue_t monitor; /* link into monitored waitqueue */
+ struct page *back_page; /* backing file page we're waiting for */
+ struct page *netfs_page; /* netfs page we're going to fill */
+ struct fscache_retrieval *op; /* retrieval op covering this */
+ struct list_head op_link; /* link in op's todo list */
+};
+
+/*
+ * backing file write tracking
+ */
+struct cachefiles_one_write {
+ struct page *netfs_page; /* netfs page to copy */
+ struct cachefiles_object *object;
+ struct list_head obj_link; /* link in object's lists */
+ fscache_rw_complete_t end_io_func;
+ void *context;
+};
+
+/*
+ * auxiliary data xattr buffer
+ */
+struct cachefiles_xattr {
+ uint16_t len;
+ uint8_t type;
+ uint8_t data[];
+};
+
+/*
+ * note change of state for daemon
+ */
+static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
+{
+ set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+ wake_up_all(&cache->daemon_pollwq);
+}
+
+/*
+ * cf-bind.c
+ */
+extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
+extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
+
+/*
+ * cf-daemon.c
+ */
+extern const struct file_operations cachefiles_daemon_fops;
+
+extern int cachefiles_has_space(struct cachefiles_cache *cache,
+ unsigned fnr, unsigned bnr);
+
+/*
+ * cf-interface.c
+ */
+extern const struct fscache_cache_ops cachefiles_cache_ops;
+
+/*
+ * cf-key.c
+ */
+extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
+
+/*
+ * cf-namei.c
+ */
+extern int cachefiles_delete_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object);
+extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
+ struct cachefiles_object *object,
+ const char *key,
+ struct cachefiles_xattr *auxdata);
+extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ const char *name);
+
+extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+ char *filename);
+
+extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
+ struct dentry *dir, char *filename);
+
+/*
+ * cf-proc.c
+ */
+#ifdef CONFIG_CACHEFILES_HISTOGRAM
+extern atomic_t cachefiles_lookup_histogram[HZ];
+extern atomic_t cachefiles_mkdir_histogram[HZ];
+extern atomic_t cachefiles_create_histogram[HZ];
+
+extern int __init cachefiles_proc_init(void);
+extern void cachefiles_proc_cleanup(void);
+static inline
+void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
+{
+ unsigned long jif = jiffies - start_jif;
+ if (jif >= HZ)
+ jif = HZ - 1;
+ atomic_inc(&histogram[jif]);
+}
+
+#else
+#define cachefiles_proc_init() (0)
+#define cachefiles_proc_cleanup() do {} while (0)
+#define cachefiles_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * cf-rdwr.c
+ */
+extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
+ struct page *, gfp_t);
+extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
+ struct list_head *, unsigned *,
+ gfp_t);
+extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
+ gfp_t);
+extern int cachefiles_allocate_pages(struct fscache_retrieval *,
+ struct list_head *, unsigned *, gfp_t);
+extern int cachefiles_write_page(struct fscache_storage *, struct page *);
+extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
+
+/*
+ * cf-security.c
+ */
+extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
+extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+ struct dentry *root,
+ const struct cred **_saved_cred);
+
+static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
+ const struct cred **_saved_cred)
+{
+ *_saved_cred = override_creds(cache->cache_cred);
+}
+
+static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
+ const struct cred *saved_cred)
+{
+ revert_creds(saved_cred);
+}
+
+/*
+ * cf-xattr.c
+ */
+extern int cachefiles_check_object_type(struct cachefiles_object *object);
+extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata);
+extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata);
+extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+ struct dentry *dentry);
+
+
+/*
+ * error handling
+ */
+#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
+
+#define cachefiles_io_error(___cache, FMT, ...) \
+do { \
+ kerror("I/O Error: " FMT, ##__VA_ARGS__); \
+ fscache_io_error(&(___cache)->cache); \
+ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
+} while (0)
+
+#define cachefiles_io_error_obj(object, FMT, ...) \
+do { \
+ struct cachefiles_cache *___cache; \
+ \
+ ___cache = container_of((object)->fscache.cache, \
+ struct cachefiles_cache, cache); \
+ cachefiles_io_error(___cache, FMT, ##__VA_ARGS__); \
+} while (0)
+
+
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline void _dbprintk(const char *fmt, ...)
+ __attribute__((format(printf, 1, 2)));
+static inline void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_CACHEFILES_DEBUG)
+#define _enter(FMT, ...) \
+do { \
+ if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
+ kenter(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _leave(FMT, ...) \
+do { \
+ if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
+ kleave(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _debug(FMT, ...) \
+do { \
+ if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
+ kdebug(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644
index 00000000000..81b8b2b3a67
--- /dev/null
+++ b/fs/cachefiles/key.c
@@ -0,0 +1,159 @@
+/* Key to pathname encoder
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+static const char cachefiles_charmap[64] =
+ "0123456789" /* 0 - 9 */
+ "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
+ "_-" /* 62 - 63 */
+ ;
+
+static const char cachefiles_filecharmap[256] = {
+ /* we skip space and tab and control chars */
+ [33 ... 46] = 1, /* '!' -> '.' */
+ /* we skip '/' as it's significant to pathwalk */
+ [48 ... 127] = 1, /* '0' -> '~' */
+};
+
+/*
+ * turn the raw key into something cooked
+ * - the raw key should include the length in the two bytes at the front
+ * - the key may be up to 514 bytes in length (including the length word)
+ * - "base64" encode the strange keys, mapping 3 bytes of raw to four of
+ * cooked
+ * - need to cut the cooked key into 252 char lengths (189 raw bytes)
+ */
+char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
+{
+ unsigned char csum, ch;
+ unsigned int acc;
+ char *key;
+ int loop, len, max, seg, mark, print;
+
+ _enter(",%d", keylen);
+
+ BUG_ON(keylen < 2 || keylen > 514);
+
+ csum = raw[0] + raw[1];
+ print = 1;
+ for (loop = 2; loop < keylen; loop++) {
+ ch = raw[loop];
+ csum += ch;
+ print &= cachefiles_filecharmap[ch];
+ }
+
+ if (print) {
+ /* if the path is usable ASCII, then we render it directly */
+ max = keylen - 2;
+ max += 2; /* two base64'd length chars on the front */
+ max += 5; /* @checksum/M */
+ max += 3 * 2; /* maximum number of segment dividers (".../M")
+ * is ((514 + 251) / 252) = 3
+ */
+ max += 1; /* NUL on end */
+ } else {
+ /* calculate the maximum length of the cooked key */
+ keylen = (keylen + 2) / 3;
+
+ max = keylen * 4;
+ max += 5; /* @checksum/M */
+ max += 3 * 2; /* maximum number of segment dividers (".../M")
+ * is ((514 + 188) / 189) = 3
+ */
+ max += 1; /* NUL on end */
+ }
+
+ max += 1; /* 2nd NUL on end */
+
+ _debug("max: %d", max);
+
+ key = kmalloc(max, GFP_KERNEL);
+ if (!key)
+ return NULL;
+
+ len = 0;
+
+ /* build the cooked key */
+ sprintf(key, "@%02x%c+", (unsigned) csum, 0);
+ len = 5;
+ mark = len - 1;
+
+ if (print) {
+ acc = *(uint16_t *) raw;
+ raw += 2;
+
+ key[len + 1] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ key[len] = cachefiles_charmap[acc & 63];
+ len += 2;
+
+ seg = 250;
+ for (loop = keylen; loop > 0; loop--) {
+ if (seg <= 0) {
+ key[len++] = '\0';
+ mark = len;
+ key[len++] = '+';
+ seg = 252;
+ }
+
+ key[len++] = *raw++;
+ ASSERT(len < max);
+ }
+
+ switch (type) {
+ case FSCACHE_COOKIE_TYPE_INDEX: type = 'I'; break;
+ case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'D'; break;
+ default: type = 'S'; break;
+ }
+ } else {
+ seg = 252;
+ for (loop = keylen; loop > 0; loop--) {
+ if (seg <= 0) {
+ key[len++] = '\0';
+ mark = len;
+ key[len++] = '+';
+ seg = 252;
+ }
+
+ acc = *raw++;
+ acc |= *raw++ << 8;
+ acc |= *raw++ << 16;
+
+ _debug("acc: %06x", acc);
+
+ key[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ key[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ key[len++] = cachefiles_charmap[acc & 63];
+ acc >>= 6;
+ key[len++] = cachefiles_charmap[acc & 63];
+
+ ASSERT(len < max);
+ }
+
+ switch (type) {
+ case FSCACHE_COOKIE_TYPE_INDEX: type = 'J'; break;
+ case FSCACHE_COOKIE_TYPE_DATAFILE: type = 'E'; break;
+ default: type = 'T'; break;
+ }
+ }
+
+ key[mark] = type;
+ key[len++] = 0;
+ key[len] = 0;
+
+ _leave(" = %p %d", key, len);
+ return key;
+}
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644
index 00000000000..4bfa8cf43bf
--- /dev/null
+++ b/fs/cachefiles/main.c
@@ -0,0 +1,106 @@
+/* Network filesystem caching backend to use cache files on a premounted
+ * filesystem
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/sysctl.h>
+#include <linux/miscdevice.h>
+#include "internal.h"
+
+unsigned cachefiles_debug;
+module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
+
+MODULE_DESCRIPTION("Mounted-filesystem based cache");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+struct kmem_cache *cachefiles_object_jar;
+
+static struct miscdevice cachefiles_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cachefiles",
+ .fops = &cachefiles_daemon_fops,
+};
+
+static void cachefiles_object_init_once(void *_object)
+{
+ struct cachefiles_object *object = _object;
+
+ memset(object, 0, sizeof(*object));
+ spin_lock_init(&object->work_lock);
+}
+
+/*
+ * initialise the fs caching module
+ */
+static int __init cachefiles_init(void)
+{
+ int ret;
+
+ ret = misc_register(&cachefiles_dev);
+ if (ret < 0)
+ goto error_dev;
+
+ /* create an object jar */
+ ret = -ENOMEM;
+ cachefiles_object_jar =
+ kmem_cache_create("cachefiles_object_jar",
+ sizeof(struct cachefiles_object),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ cachefiles_object_init_once);
+ if (!cachefiles_object_jar) {
+ printk(KERN_NOTICE
+ "CacheFiles: Failed to allocate an object jar\n");
+ goto error_object_jar;
+ }
+
+ ret = cachefiles_proc_init();
+ if (ret < 0)
+ goto error_proc;
+
+ printk(KERN_INFO "CacheFiles: Loaded\n");
+ return 0;
+
+error_proc:
+ kmem_cache_destroy(cachefiles_object_jar);
+error_object_jar:
+ misc_deregister(&cachefiles_dev);
+error_dev:
+ kerror("failed to register: %d", ret);
+ return ret;
+}
+
+fs_initcall(cachefiles_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit cachefiles_exit(void)
+{
+ printk(KERN_INFO "CacheFiles: Unloading\n");
+
+ cachefiles_proc_cleanup();
+ kmem_cache_destroy(cachefiles_object_jar);
+ misc_deregister(&cachefiles_dev);
+}
+
+module_exit(cachefiles_exit);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644
index 00000000000..4ce818ae39e
--- /dev/null
+++ b/fs/cachefiles/namei.c
@@ -0,0 +1,771 @@
+/* CacheFiles path walking and related routines
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include "internal.h"
+
+static int cachefiles_wait_bit(void *flags)
+{
+ schedule();
+ return 0;
+}
+
+/*
+ * record the fact that an object is now active
+ */
+static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
+{
+ struct cachefiles_object *xobject;
+ struct rb_node **_p, *_parent = NULL;
+ struct dentry *dentry;
+
+ _enter(",%p", object);
+
+try_again:
+ write_lock(&cache->active_lock);
+
+ if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+ BUG();
+
+ dentry = object->dentry;
+ _p = &cache->active_nodes.rb_node;
+ while (*_p) {
+ _parent = *_p;
+ xobject = rb_entry(_parent,
+ struct cachefiles_object, active_node);
+
+ ASSERT(xobject != object);
+
+ if (xobject->dentry > dentry)
+ _p = &(*_p)->rb_left;
+ else if (xobject->dentry < dentry)
+ _p = &(*_p)->rb_right;
+ else
+ goto wait_for_old_object;
+ }
+
+ rb_link_node(&object->active_node, _parent, _p);
+ rb_insert_color(&object->active_node, &cache->active_nodes);
+
+ write_unlock(&cache->active_lock);
+ _leave("");
+ return;
+
+ /* an old object from a previous incarnation is hogging the slot - we
+ * need to wait for it to be destroyed */
+wait_for_old_object:
+ if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+ printk(KERN_ERR "\n");
+ printk(KERN_ERR "CacheFiles: Error:"
+ " Unexpected object collision\n");
+ printk(KERN_ERR "xobject: OBJ%x\n",
+ xobject->fscache.debug_id);
+ printk(KERN_ERR "xobjstate=%s\n",
+ fscache_object_states[xobject->fscache.state]);
+ printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
+ printk(KERN_ERR "xobjevent=%lx [%lx]\n",
+ xobject->fscache.events, xobject->fscache.event_mask);
+ printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
+ xobject->fscache.n_ops, xobject->fscache.n_in_progress,
+ xobject->fscache.n_exclusive);
+ printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
+ xobject->fscache.cookie,
+ xobject->fscache.cookie->parent,
+ xobject->fscache.cookie->netfs_data,
+ xobject->fscache.cookie->flags);
+ printk(KERN_ERR "xparent=%p\n",
+ xobject->fscache.parent);
+ printk(KERN_ERR "object: OBJ%x\n",
+ object->fscache.debug_id);
+ printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
+ object->fscache.cookie,
+ object->fscache.cookie->parent,
+ object->fscache.cookie->netfs_data,
+ object->fscache.cookie->flags);
+ printk(KERN_ERR "parent=%p\n",
+ object->fscache.parent);
+ BUG();
+ }
+ atomic_inc(&xobject->usage);
+ write_unlock(&cache->active_lock);
+
+ _debug(">>> wait");
+ wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
+ cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
+ _debug("<<< waited");
+
+ cache->cache.ops->put_object(&xobject->fscache);
+ goto try_again;
+}
+
+/*
+ * delete an object representation from the cache
+ * - file backed objects are unlinked
+ * - directory backed objects are stuffed into the graveyard for userspace to
+ * delete
+ * - unlocks the directory mutex
+ */
+static int cachefiles_bury_object(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ struct dentry *rep)
+{
+ struct dentry *grave, *trap;
+ char nbuffer[8 + 8 + 1];
+ int ret;
+
+ _enter(",'%*.*s','%*.*s'",
+ dir->d_name.len, dir->d_name.len, dir->d_name.name,
+ rep->d_name.len, rep->d_name.len, rep->d_name.name);
+
+ /* non-directories can just be unlinked */
+ if (!S_ISDIR(rep->d_inode->i_mode)) {
+ _debug("unlink stale object");
+ ret = vfs_unlink(dir->d_inode, rep);
+
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "Unlink failed");
+
+ _leave(" = %d", ret);
+ return ret;
+ }
+
+ /* directories have to be moved to the graveyard */
+ _debug("move stale object to graveyard");
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+try_again:
+ /* first step is to make up a grave dentry in the graveyard */
+ sprintf(nbuffer, "%08x%08x",
+ (uint32_t) get_seconds(),
+ (uint32_t) atomic_inc_return(&cache->gravecounter));
+
+ /* do the multiway lock magic */
+ trap = lock_rename(cache->graveyard, dir);
+
+ /* do some checks before getting the grave dentry */
+ if (rep->d_parent != dir) {
+ /* the entry was probably culled when we dropped the parent dir
+ * lock */
+ unlock_rename(cache->graveyard, dir);
+ _leave(" = 0 [culled?]");
+ return 0;
+ }
+
+ if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
+ unlock_rename(cache->graveyard, dir);
+ cachefiles_io_error(cache, "Graveyard no longer a directory");
+ return -EIO;
+ }
+
+ if (trap == rep) {
+ unlock_rename(cache->graveyard, dir);
+ cachefiles_io_error(cache, "May not make directory loop");
+ return -EIO;
+ }
+
+ if (d_mountpoint(rep)) {
+ unlock_rename(cache->graveyard, dir);
+ cachefiles_io_error(cache, "Mountpoint in cache");
+ return -EIO;
+ }
+
+ grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+ if (IS_ERR(grave)) {
+ unlock_rename(cache->graveyard, dir);
+
+ if (PTR_ERR(grave) == -ENOMEM) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ cachefiles_io_error(cache, "Lookup error %ld",
+ PTR_ERR(grave));
+ return -EIO;
+ }
+
+ if (grave->d_inode) {
+ unlock_rename(cache->graveyard, dir);
+ dput(grave);
+ grave = NULL;
+ cond_resched();
+ goto try_again;
+ }
+
+ if (d_mountpoint(grave)) {
+ unlock_rename(cache->graveyard, dir);
+ dput(grave);
+ cachefiles_io_error(cache, "Mountpoint in graveyard");
+ return -EIO;
+ }
+
+ /* target should not be an ancestor of source */
+ if (trap == grave) {
+ unlock_rename(cache->graveyard, dir);
+ dput(grave);
+ cachefiles_io_error(cache, "May not make directory loop");
+ return -EIO;
+ }
+
+ /* attempt the rename */
+ ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
+ if (ret != 0 && ret != -ENOMEM)
+ cachefiles_io_error(cache, "Rename failed with error %d", ret);
+
+ unlock_rename(cache->graveyard, dir);
+ dput(grave);
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * delete an object representation from the cache
+ */
+int cachefiles_delete_object(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
+{
+ struct dentry *dir;
+ int ret;
+
+ _enter(",{%p}", object->dentry);
+
+ ASSERT(object->dentry);
+ ASSERT(object->dentry->d_inode);
+ ASSERT(object->dentry->d_parent);
+
+ dir = dget_parent(object->dentry);
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ ret = cachefiles_bury_object(cache, dir, object->dentry);
+
+ dput(dir);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * walk from the parent object to the child object through the backing
+ * filesystem, creating directories as we go
+ */
+int cachefiles_walk_to_object(struct cachefiles_object *parent,
+ struct cachefiles_object *object,
+ const char *key,
+ struct cachefiles_xattr *auxdata)
+{
+ struct cachefiles_cache *cache;
+ struct dentry *dir, *next = NULL;
+ unsigned long start;
+ const char *name;
+ int ret, nlen;
+
+ _enter("{%p},,%s,", parent->dentry, key);
+
+ cache = container_of(parent->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ ASSERT(parent->dentry);
+ ASSERT(parent->dentry->d_inode);
+
+ if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
+ // TODO: convert file to dir
+ _leave("looking up in none directory");
+ return -ENOBUFS;
+ }
+
+ dir = dget(parent->dentry);
+
+advance:
+ /* attempt to transit the first directory component */
+ name = key;
+ nlen = strlen(key);
+
+ /* key ends in a double NUL */
+ key = key + nlen + 1;
+ if (!*key)
+ key = NULL;
+
+lookup_again:
+ /* search the current directory for the element name */
+ _debug("lookup '%s'", name);
+
+ mutex_lock(&dir->d_inode->i_mutex);
+
+ start = jiffies;
+ next = lookup_one_len(name, dir, nlen);
+ cachefiles_hist(cachefiles_lookup_histogram, start);
+ if (IS_ERR(next))
+ goto lookup_error;
+
+ _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
+
+ if (!key)
+ object->new = !next->d_inode;
+
+ /* if this element of the path doesn't exist, then the lookup phase
+ * failed, and we can release any readers in the certain knowledge that
+ * there's nothing for them to actually read */
+ if (!next->d_inode)
+ fscache_object_lookup_negative(&object->fscache);
+
+ /* we need to create the object if it's negative */
+ if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
+ /* index objects and intervening tree levels must be subdirs */
+ if (!next->d_inode) {
+ ret = cachefiles_has_space(cache, 1, 0);
+ if (ret < 0)
+ goto create_error;
+
+ start = jiffies;
+ ret = vfs_mkdir(dir->d_inode, next, 0);
+ cachefiles_hist(cachefiles_mkdir_histogram, start);
+ if (ret < 0)
+ goto create_error;
+
+ ASSERT(next->d_inode);
+
+ _debug("mkdir -> %p{%p{ino=%lu}}",
+ next, next->d_inode, next->d_inode->i_ino);
+
+ } else if (!S_ISDIR(next->d_inode->i_mode)) {
+ kerror("inode %lu is not a directory",
+ next->d_inode->i_ino);
+ ret = -ENOBUFS;
+ goto error;
+ }
+
+ } else {
+ /* non-index objects start out life as files */
+ if (!next->d_inode) {
+ ret = cachefiles_has_space(cache, 1, 0);
+ if (ret < 0)
+ goto create_error;
+
+ start = jiffies;
+ ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
+ cachefiles_hist(cachefiles_create_histogram, start);
+ if (ret < 0)
+ goto create_error;
+
+ ASSERT(next->d_inode);
+
+ _debug("create -> %p{%p{ino=%lu}}",
+ next, next->d_inode, next->d_inode->i_ino);
+
+ } else if (!S_ISDIR(next->d_inode->i_mode) &&
+ !S_ISREG(next->d_inode->i_mode)
+ ) {
+ kerror("inode %lu is not a file or directory",
+ next->d_inode->i_ino);
+ ret = -ENOBUFS;
+ goto error;
+ }
+ }
+
+ /* process the next component */
+ if (key) {
+ _debug("advance");
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(dir);
+ dir = next;
+ next = NULL;
+ goto advance;
+ }
+
+ /* we've found the object we were looking for */
+ object->dentry = next;
+
+ /* if we've found that the terminal object exists, then we need to
+ * check its attributes and delete it if it's out of date */
+ if (!object->new) {
+ _debug("validate '%*.*s'",
+ next->d_name.len, next->d_name.len, next->d_name.name);
+
+ ret = cachefiles_check_object_xattr(object, auxdata);
+ if (ret == -ESTALE) {
+ /* delete the object (the deleter drops the directory
+ * mutex) */
+ object->dentry = NULL;
+
+ ret = cachefiles_bury_object(cache, dir, next);
+ dput(next);
+ next = NULL;
+
+ if (ret < 0)
+ goto delete_error;
+
+ _debug("redo lookup");
+ goto lookup_again;
+ }
+ }
+
+ /* note that we're now using this object */
+ cachefiles_mark_object_active(cache, object);
+
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(dir);
+ dir = NULL;
+
+ _debug("=== OBTAINED_OBJECT ===");
+
+ if (object->new) {
+ /* attach data to a newly constructed terminal object */
+ ret = cachefiles_set_object_xattr(object, auxdata);
+ if (ret < 0)
+ goto check_error;
+ } else {
+ /* always update the atime on an object we've just looked up
+ * (this is used to keep track of culling, and atimes are only
+ * updated by read, write and readdir but not lookup or
+ * open) */
+ touch_atime(cache->mnt, next);
+ }
+
+ /* open a file interface onto a data file */
+ if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (S_ISREG(object->dentry->d_inode->i_mode)) {
+ const struct address_space_operations *aops;
+
+ ret = -EPERM;
+ aops = object->dentry->d_inode->i_mapping->a_ops;
+ if (!aops->bmap)
+ goto check_error;
+
+ object->backer = object->dentry;
+ } else {
+ BUG(); // TODO: open file in data-class subdir
+ }
+ }
+
+ object->new = 0;
+ fscache_obtained_object(&object->fscache);
+
+ _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
+ return 0;
+
+create_error:
+ _debug("create error %d", ret);
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "Create/mkdir failed");
+ goto error;
+
+check_error:
+ _debug("check error %d", ret);
+ write_lock(&cache->active_lock);
+ rb_erase(&object->active_node, &cache->active_nodes);
+ clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+ wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+ write_unlock(&cache->active_lock);
+
+ dput(object->dentry);
+ object->dentry = NULL;
+ goto error_out;
+
+delete_error:
+ _debug("delete error %d", ret);
+ goto error_out2;
+
+lookup_error:
+ _debug("lookup error %ld", PTR_ERR(next));
+ ret = PTR_ERR(next);
+ if (ret == -EIO)
+ cachefiles_io_error(cache, "Lookup failed");
+ next = NULL;
+error:
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(next);
+error_out2:
+ dput(dir);
+error_out:
+ if (ret == -ENOSPC)
+ ret = -ENOBUFS;
+
+ _leave(" = error %d", -ret);
+ return ret;
+}
+
+/*
+ * get a subdirectory
+ */
+struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ const char *dirname)
+{
+ struct dentry *subdir;
+ unsigned long start;
+ int ret;
+
+ _enter(",,%s", dirname);
+
+ /* search the current directory for the element name */
+ mutex_lock(&dir->d_inode->i_mutex);
+
+ start = jiffies;
+ subdir = lookup_one_len(dirname, dir, strlen(dirname));
+ cachefiles_hist(cachefiles_lookup_histogram, start);
+ if (IS_ERR(subdir)) {
+ if (PTR_ERR(subdir) == -ENOMEM)
+ goto nomem_d_alloc;
+ goto lookup_error;
+ }
+
+ _debug("subdir -> %p %s",
+ subdir, subdir->d_inode ? "positive" : "negative");
+
+ /* we need to create the subdir if it doesn't exist yet */
+ if (!subdir->d_inode) {
+ ret = cachefiles_has_space(cache, 1, 0);
+ if (ret < 0)
+ goto mkdir_error;
+
+ _debug("attempt mkdir");
+
+ ret = vfs_mkdir(dir->d_inode, subdir, 0700);
+ if (ret < 0)
+ goto mkdir_error;
+
+ ASSERT(subdir->d_inode);
+
+ _debug("mkdir -> %p{%p{ino=%lu}}",
+ subdir,
+ subdir->d_inode,
+ subdir->d_inode->i_ino);
+ }
+
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ /* we need to make sure the subdir is a directory */
+ ASSERT(subdir->d_inode);
+
+ if (!S_ISDIR(subdir->d_inode->i_mode)) {
+ kerror("%s is not a directory", dirname);
+ ret = -EIO;
+ goto check_error;
+ }
+
+ ret = -EPERM;
+ if (!subdir->d_inode->i_op ||
+ !subdir->d_inode->i_op->setxattr ||
+ !subdir->d_inode->i_op->getxattr ||
+ !subdir->d_inode->i_op->lookup ||
+ !subdir->d_inode->i_op->mkdir ||
+ !subdir->d_inode->i_op->create ||
+ !subdir->d_inode->i_op->rename ||
+ !subdir->d_inode->i_op->rmdir ||
+ !subdir->d_inode->i_op->unlink)
+ goto check_error;
+
+ _leave(" = [%lu]", subdir->d_inode->i_ino);
+ return subdir;
+
+check_error:
+ dput(subdir);
+ _leave(" = %d [check]", ret);
+ return ERR_PTR(ret);
+
+mkdir_error:
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(subdir);
+ kerror("mkdir %s failed with error %d", dirname, ret);
+ return ERR_PTR(ret);
+
+lookup_error:
+ mutex_unlock(&dir->d_inode->i_mutex);
+ ret = PTR_ERR(subdir);
+ kerror("Lookup %s failed with error %d", dirname, ret);
+ return ERR_PTR(ret);
+
+nomem_d_alloc:
+ mutex_unlock(&dir->d_inode->i_mutex);
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * find out if an object is in use or not
+ * - if finds object and it's not in use:
+ * - returns a pointer to the object and a reference on it
+ * - returns with the directory locked
+ */
+static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
+ struct dentry *dir,
+ char *filename)
+{
+ struct cachefiles_object *object;
+ struct rb_node *_n;
+ struct dentry *victim;
+ unsigned long start;
+ int ret;
+
+ //_enter(",%*.*s/,%s",
+ // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+ /* look up the victim */
+ mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+
+ start = jiffies;
+ victim = lookup_one_len(filename, dir, strlen(filename));
+ cachefiles_hist(cachefiles_lookup_histogram, start);
+ if (IS_ERR(victim))
+ goto lookup_error;
+
+ //_debug("victim -> %p %s",
+ // victim, victim->d_inode ? "positive" : "negative");
+
+ /* if the object is no longer there then we probably retired the object
+ * at the netfs's request whilst the cull was in progress
+ */
+ if (!victim->d_inode) {
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(victim);
+ _leave(" = -ENOENT [absent]");
+ return ERR_PTR(-ENOENT);
+ }
+
+ /* check to see if we're using this object */
+ read_lock(&cache->active_lock);
+
+ _n = cache->active_nodes.rb_node;
+
+ while (_n) {
+ object = rb_entry(_n, struct cachefiles_object, active_node);
+
+ if (object->dentry > victim)
+ _n = _n->rb_left;
+ else if (object->dentry < victim)
+ _n = _n->rb_right;
+ else
+ goto object_in_use;
+ }
+
+ read_unlock(&cache->active_lock);
+
+ //_leave(" = %p", victim);
+ return victim;
+
+object_in_use:
+ read_unlock(&cache->active_lock);
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(victim);
+ //_leave(" = -EBUSY [in use]");
+ return ERR_PTR(-EBUSY);
+
+lookup_error:
+ mutex_unlock(&dir->d_inode->i_mutex);
+ ret = PTR_ERR(victim);
+ if (ret == -ENOENT) {
+ /* file or dir now absent - probably retired by netfs */
+ _leave(" = -ESTALE [absent]");
+ return ERR_PTR(-ESTALE);
+ }
+
+ if (ret == -EIO) {
+ cachefiles_io_error(cache, "Lookup failed");
+ } else if (ret != -ENOMEM) {
+ kerror("Internal error: %d", ret);
+ ret = -EIO;
+ }
+
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * cull an object if it's not in use
+ * - called only by cache manager daemon
+ */
+int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+ char *filename)
+{
+ struct dentry *victim;
+ int ret;
+
+ _enter(",%*.*s/,%s",
+ dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+ victim = cachefiles_check_active(cache, dir, filename);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
+
+ _debug("victim -> %p %s",
+ victim, victim->d_inode ? "positive" : "negative");
+
+ /* okay... the victim is not being used so we can cull it
+ * - start by marking it as stale
+ */
+ _debug("victim is cullable");
+
+ ret = cachefiles_remove_object_xattr(cache, victim);
+ if (ret < 0)
+ goto error_unlock;
+
+ /* actually remove the victim (drops the dir mutex) */
+ _debug("bury");
+
+ ret = cachefiles_bury_object(cache, dir, victim);
+ if (ret < 0)
+ goto error;
+
+ dput(victim);
+ _leave(" = 0");
+ return 0;
+
+error_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
+error:
+ dput(victim);
+ if (ret == -ENOENT) {
+ /* file or dir now absent - probably retired by netfs */
+ _leave(" = -ESTALE [absent]");
+ return -ESTALE;
+ }
+
+ if (ret != -ENOMEM) {
+ kerror("Internal error: %d", ret);
+ ret = -EIO;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * find out if an object is in use or not
+ * - called only by cache manager daemon
+ * - returns -EBUSY or 0 to indicate whether an object is in use or not
+ */
+int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
+ char *filename)
+{
+ struct dentry *victim;
+
+ //_enter(",%*.*s/,%s",
+ // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+ victim = cachefiles_check_active(cache, dir, filename);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
+
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(victim);
+ //_leave(" = 0");
+ return 0;
+}
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644
index 00000000000..eccd3394119
--- /dev/null
+++ b/fs/cachefiles/proc.c
@@ -0,0 +1,134 @@
+/* CacheFiles statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t cachefiles_lookup_histogram[HZ];
+atomic_t cachefiles_mkdir_histogram[HZ];
+atomic_t cachefiles_create_histogram[HZ];
+
+/*
+ * display the latency histogram
+ */
+static int cachefiles_histogram_show(struct seq_file *m, void *v)
+{
+ unsigned long index;
+ unsigned x, y, z, t;
+
+ switch ((unsigned long) v) {
+ case 1:
+ seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n");
+ return 0;
+ case 2:
+ seq_puts(m, "===== ===== ========= ========= =========\n");
+ return 0;
+ default:
+ index = (unsigned long) v - 3;
+ x = atomic_read(&cachefiles_lookup_histogram[index]);
+ y = atomic_read(&cachefiles_mkdir_histogram[index]);
+ z = atomic_read(&cachefiles_create_histogram[index]);
+ if (x == 0 && y == 0 && z == 0)
+ return 0;
+
+ t = (index * 1000) / HZ;
+
+ seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z);
+ return 0;
+ }
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+ if ((unsigned long long)*_pos >= HZ + 2)
+ return NULL;
+ if (*_pos == 0)
+ *_pos = 1;
+ return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return (unsigned long long)*pos > HZ + 2 ?
+ NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void cachefiles_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations cachefiles_histogram_ops = {
+ .start = cachefiles_histogram_start,
+ .stop = cachefiles_histogram_stop,
+ .next = cachefiles_histogram_next,
+ .show = cachefiles_histogram_show,
+};
+
+/*
+ * open "/proc/fs/cachefiles/XXX" which provide statistics summaries
+ */
+static int cachefiles_histogram_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &cachefiles_histogram_ops);
+}
+
+static const struct file_operations cachefiles_histogram_fops = {
+ .owner = THIS_MODULE,
+ .open = cachefiles_histogram_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * initialise the /proc/fs/cachefiles/ directory
+ */
+int __init cachefiles_proc_init(void)
+{
+ _enter("");
+
+ if (!proc_mkdir("fs/cachefiles", NULL))
+ goto error_dir;
+
+ if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+ &cachefiles_histogram_fops))
+ goto error_histogram;
+
+ _leave(" = 0");
+ return 0;
+
+error_histogram:
+ remove_proc_entry("fs/cachefiles", NULL);
+error_dir:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/cachefiles/ directory
+ */
+void cachefiles_proc_cleanup(void)
+{
+ remove_proc_entry("fs/cachefiles/histogram", NULL);
+ remove_proc_entry("fs/cachefiles", NULL);
+}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644
index 00000000000..a69787e7dd9
--- /dev/null
+++ b/fs/cachefiles/rdwr.c
@@ -0,0 +1,879 @@
+/* Storage object read/write
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/file.h>
+#include "internal.h"
+
+/*
+ * detect wake up events generated by the unlocking of pages in which we're
+ * interested
+ * - we use this to detect read completion of backing pages
+ * - the caller holds the waitqueue lock
+ */
+static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+ int sync, void *_key)
+{
+ struct cachefiles_one_read *monitor =
+ container_of(wait, struct cachefiles_one_read, monitor);
+ struct cachefiles_object *object;
+ struct wait_bit_key *key = _key;
+ struct page *page = wait->private;
+
+ ASSERT(key);
+
+ _enter("{%lu},%u,%d,{%p,%u}",
+ monitor->netfs_page->index, mode, sync,
+ key->flags, key->bit_nr);
+
+ if (key->flags != &page->flags ||
+ key->bit_nr != PG_locked)
+ return 0;
+
+ _debug("--- monitor %p %lx ---", page, page->flags);
+
+ if (!PageUptodate(page) && !PageError(page))
+ dump_stack();
+
+ /* remove from the waitqueue */
+ list_del(&wait->task_list);
+
+ /* move onto the action list and queue for FS-Cache thread pool */
+ ASSERT(monitor->op);
+
+ object = container_of(monitor->op->op.object,
+ struct cachefiles_object, fscache);
+
+ spin_lock(&object->work_lock);
+ list_add_tail(&monitor->op_link, &monitor->op->to_do);
+ spin_unlock(&object->work_lock);
+
+ fscache_enqueue_retrieval(monitor->op);
+ return 0;
+}
+
+/*
+ * copy data from backing pages to netfs pages to complete a read operation
+ * - driven by FS-Cache's thread pool
+ */
+static void cachefiles_read_copier(struct fscache_operation *_op)
+{
+ struct cachefiles_one_read *monitor;
+ struct cachefiles_object *object;
+ struct fscache_retrieval *op;
+ struct pagevec pagevec;
+ int error, max;
+
+ op = container_of(_op, struct fscache_retrieval, op);
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+
+ _enter("{ino=%lu}", object->backer->d_inode->i_ino);
+
+ pagevec_init(&pagevec, 0);
+
+ max = 8;
+ spin_lock_irq(&object->work_lock);
+
+ while (!list_empty(&op->to_do)) {
+ monitor = list_entry(op->to_do.next,
+ struct cachefiles_one_read, op_link);
+ list_del(&monitor->op_link);
+
+ spin_unlock_irq(&object->work_lock);
+
+ _debug("- copy {%lu}", monitor->back_page->index);
+
+ error = -EIO;
+ if (PageUptodate(monitor->back_page)) {
+ copy_highpage(monitor->netfs_page, monitor->back_page);
+
+ pagevec_add(&pagevec, monitor->netfs_page);
+ fscache_mark_pages_cached(monitor->op, &pagevec);
+ error = 0;
+ }
+
+ if (error)
+ cachefiles_io_error_obj(
+ object,
+ "Readpage failed on backing file %lx",
+ (unsigned long) monitor->back_page->flags);
+
+ page_cache_release(monitor->back_page);
+
+ fscache_end_io(op, monitor->netfs_page, error);
+ page_cache_release(monitor->netfs_page);
+ fscache_put_retrieval(op);
+ kfree(monitor);
+
+ /* let the thread pool have some air occasionally */
+ max--;
+ if (max < 0 || need_resched()) {
+ if (!list_empty(&op->to_do))
+ fscache_enqueue_retrieval(op);
+ _leave(" [maxed out]");
+ return;
+ }
+
+ spin_lock_irq(&object->work_lock);
+ }
+
+ spin_unlock_irq(&object->work_lock);
+ _leave("");
+}
+
+/*
+ * read the corresponding page to the given set from the backing file
+ * - an uncertain page is simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
+ struct fscache_retrieval *op,
+ struct page *netpage,
+ struct pagevec *pagevec)
+{
+ struct cachefiles_one_read *monitor;
+ struct address_space *bmapping;
+ struct page *newpage, *backpage;
+ int ret;
+
+ _enter("");
+
+ pagevec_reinit(pagevec);
+
+ _debug("read back %p{%lu,%d}",
+ netpage, netpage->index, page_count(netpage));
+
+ monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ if (!monitor)
+ goto nomem;
+
+ monitor->netfs_page = netpage;
+ monitor->op = fscache_get_retrieval(op);
+
+ init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
+
+ /* attempt to get hold of the backing page */
+ bmapping = object->backer->d_inode->i_mapping;
+ newpage = NULL;
+
+ for (;;) {
+ backpage = find_get_page(bmapping, netpage->index);
+ if (backpage)
+ goto backing_page_already_present;
+
+ if (!newpage) {
+ newpage = page_cache_alloc_cold(bmapping);
+ if (!newpage)
+ goto nomem_monitor;
+ }
+
+ ret = add_to_page_cache(newpage, bmapping,
+ netpage->index, GFP_KERNEL);
+ if (ret == 0)
+ goto installed_new_backing_page;
+ if (ret != -EEXIST)
+ goto nomem_page;
+ }
+
+ /* we've installed a new backing page, so now we need to add it
+ * to the LRU list and start it reading */
+installed_new_backing_page:
+ _debug("- new %p", newpage);
+
+ backpage = newpage;
+ newpage = NULL;
+
+ page_cache_get(backpage);
+ pagevec_add(pagevec, backpage);
+ __pagevec_lru_add_file(pagevec);
+
+read_backing_page:
+ ret = bmapping->a_ops->readpage(NULL, backpage);
+ if (ret < 0)
+ goto read_error;
+
+ /* set the monitor to transfer the data across */
+monitor_backing_page:
+ _debug("- monitor add");
+
+ /* install the monitor */
+ page_cache_get(monitor->netfs_page);
+ page_cache_get(backpage);
+ monitor->back_page = backpage;
+ monitor->monitor.private = backpage;
+ add_page_wait_queue(backpage, &monitor->monitor);
+ monitor = NULL;
+
+ /* but the page may have been read before the monitor was installed, so
+ * the monitor may miss the event - so we have to ensure that we do get
+ * one in such a case */
+ if (trylock_page(backpage)) {
+ _debug("jumpstart %p {%lx}", backpage, backpage->flags);
+ unlock_page(backpage);
+ }
+ goto success;
+
+ /* if the backing page is already present, it can be in one of
+ * three states: read in progress, read failed or read okay */
+backing_page_already_present:
+ _debug("- present");
+
+ if (newpage) {
+ page_cache_release(newpage);
+ newpage = NULL;
+ }
+
+ if (PageError(backpage))
+ goto io_error;
+
+ if (PageUptodate(backpage))
+ goto backing_page_already_uptodate;
+
+ if (!trylock_page(backpage))
+ goto monitor_backing_page;
+ _debug("read %p {%lx}", backpage, backpage->flags);
+ goto read_backing_page;
+
+ /* the backing page is already up to date, attach the netfs
+ * page to the pagecache and LRU and copy the data across */
+backing_page_already_uptodate:
+ _debug("- uptodate");
+
+ pagevec_add(pagevec, netpage);
+ fscache_mark_pages_cached(op, pagevec);
+
+ copy_highpage(netpage, backpage);
+ fscache_end_io(op, netpage, 0);
+
+success:
+ _debug("success");
+ ret = 0;
+
+out:
+ if (backpage)
+ page_cache_release(backpage);
+ if (monitor) {
+ fscache_put_retrieval(monitor->op);
+ kfree(monitor);
+ }
+ _leave(" = %d", ret);
+ return ret;
+
+read_error:
+ _debug("read error %d", ret);
+ if (ret == -ENOMEM)
+ goto out;
+io_error:
+ cachefiles_io_error_obj(object, "Page read error on backing file");
+ ret = -ENOBUFS;
+ goto out;
+
+nomem_page:
+ page_cache_release(newpage);
+nomem_monitor:
+ fscache_put_retrieval(monitor->op);
+ kfree(monitor);
+nomem:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - if the page is backed by a block in the cache:
+ * - a read will be started which will call the callback on completion
+ * - 0 will be returned
+ * - else if the page is unbacked:
+ * - the metadata will be retained
+ * - -ENODATA will be returned
+ */
+int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
+ struct page *page,
+ gfp_t gfp)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct pagevec pagevec;
+ struct inode *inode;
+ sector_t block0, block;
+ unsigned shift;
+ int ret;
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ _enter("{%p},{%lx},,,", object, page->index);
+
+ if (!object->backer)
+ return -ENOBUFS;
+
+ inode = object->backer->d_inode;
+ ASSERT(S_ISREG(inode->i_mode));
+ ASSERT(inode->i_mapping->a_ops->bmap);
+ ASSERT(inode->i_mapping->a_ops->readpages);
+
+ /* calculate the shift required to use bmap */
+ if (inode->i_sb->s_blocksize > PAGE_SIZE)
+ return -ENOBUFS;
+
+ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+ op->op.flags = FSCACHE_OP_FAST;
+ op->op.processor = cachefiles_read_copier;
+
+ pagevec_init(&pagevec, 0);
+
+ /* we assume the absence or presence of the first block is a good
+ * enough indication for the page as a whole
+ * - TODO: don't use bmap() for this as it is _not_ actually good
+ * enough for this as it doesn't indicate errors, but it's all we've
+ * got for the moment
+ */
+ block0 = page->index;
+ block0 <<= shift;
+
+ block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
+ _debug("%llx -> %llx",
+ (unsigned long long) block0,
+ (unsigned long long) block);
+
+ if (block) {
+ /* submit the apparently valid page to the backing fs to be
+ * read from disk */
+ ret = cachefiles_read_backing_file_one(object, op, page,
+ &pagevec);
+ } else if (cachefiles_has_space(cache, 0, 1) == 0) {
+ /* there's space in the cache we can use */
+ pagevec_add(&pagevec, page);
+ fscache_mark_pages_cached(op, &pagevec);
+ ret = -ENODATA;
+ } else {
+ ret = -ENOBUFS;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * read the corresponding pages to the given set from the backing file
+ * - any uncertain pages are simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file(struct cachefiles_object *object,
+ struct fscache_retrieval *op,
+ struct list_head *list,
+ struct pagevec *mark_pvec)
+{
+ struct cachefiles_one_read *monitor = NULL;
+ struct address_space *bmapping = object->backer->d_inode->i_mapping;
+ struct pagevec lru_pvec;
+ struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
+ int ret = 0;
+
+ _enter("");
+
+ pagevec_init(&lru_pvec, 0);
+
+ list_for_each_entry_safe(netpage, _n, list, lru) {
+ list_del(&netpage->lru);
+
+ _debug("read back %p{%lu,%d}",
+ netpage, netpage->index, page_count(netpage));
+
+ if (!monitor) {
+ monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ if (!monitor)
+ goto nomem;
+
+ monitor->op = fscache_get_retrieval(op);
+ init_waitqueue_func_entry(&monitor->monitor,
+ cachefiles_read_waiter);
+ }
+
+ for (;;) {
+ backpage = find_get_page(bmapping, netpage->index);
+ if (backpage)
+ goto backing_page_already_present;
+
+ if (!newpage) {
+ newpage = page_cache_alloc_cold(bmapping);
+ if (!newpage)
+ goto nomem;
+ }
+
+ ret = add_to_page_cache(newpage, bmapping,
+ netpage->index, GFP_KERNEL);
+ if (ret == 0)
+ goto installed_new_backing_page;
+ if (ret != -EEXIST)
+ goto nomem;
+ }
+
+ /* we've installed a new backing page, so now we need to add it
+ * to the LRU list and start it reading */
+ installed_new_backing_page:
+ _debug("- new %p", newpage);
+
+ backpage = newpage;
+ newpage = NULL;
+
+ page_cache_get(backpage);
+ if (!pagevec_add(&lru_pvec, backpage))
+ __pagevec_lru_add_file(&lru_pvec);
+
+ reread_backing_page:
+ ret = bmapping->a_ops->readpage(NULL, backpage);
+ if (ret < 0)
+ goto read_error;
+
+ /* add the netfs page to the pagecache and LRU, and set the
+ * monitor to transfer the data across */
+ monitor_backing_page:
+ _debug("- monitor add");
+
+ ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+ GFP_KERNEL);
+ if (ret < 0) {
+ if (ret == -EEXIST) {
+ page_cache_release(netpage);
+ continue;
+ }
+ goto nomem;
+ }
+
+ page_cache_get(netpage);
+ if (!pagevec_add(&lru_pvec, netpage))
+ __pagevec_lru_add_file(&lru_pvec);
+
+ /* install a monitor */
+ page_cache_get(netpage);
+ monitor->netfs_page = netpage;
+
+ page_cache_get(backpage);
+ monitor->back_page = backpage;
+ monitor->monitor.private = backpage;
+ add_page_wait_queue(backpage, &monitor->monitor);
+ monitor = NULL;
+
+ /* but the page may have been read before the monitor was
+ * installed, so the monitor may miss the event - so we have to
+ * ensure that we do get one in such a case */
+ if (trylock_page(backpage)) {
+ _debug("2unlock %p {%lx}", backpage, backpage->flags);
+ unlock_page(backpage);
+ }
+
+ page_cache_release(backpage);
+ backpage = NULL;
+
+ page_cache_release(netpage);
+ netpage = NULL;
+ continue;
+
+ /* if the backing page is already present, it can be in one of
+ * three states: read in progress, read failed or read okay */
+ backing_page_already_present:
+ _debug("- present %p", backpage);
+
+ if (PageError(backpage))
+ goto io_error;
+
+ if (PageUptodate(backpage))
+ goto backing_page_already_uptodate;
+
+ _debug("- not ready %p{%lx}", backpage, backpage->flags);
+
+ if (!trylock_page(backpage))
+ goto monitor_backing_page;
+
+ if (PageError(backpage)) {
+ _debug("error %lx", backpage->flags);
+ unlock_page(backpage);
+ goto io_error;
+ }
+
+ if (PageUptodate(backpage))
+ goto backing_page_already_uptodate_unlock;
+
+ /* we've locked a page that's neither up to date nor erroneous,
+ * so we need to attempt to read it again */
+ goto reread_backing_page;
+
+ /* the backing page is already up to date, attach the netfs
+ * page to the pagecache and LRU and copy the data across */
+ backing_page_already_uptodate_unlock:
+ _debug("uptodate %lx", backpage->flags);
+ unlock_page(backpage);
+ backing_page_already_uptodate:
+ _debug("- uptodate");
+
+ ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+ GFP_KERNEL);
+ if (ret < 0) {
+ if (ret == -EEXIST) {
+ page_cache_release(netpage);
+ continue;
+ }
+ goto nomem;
+ }
+
+ copy_highpage(netpage, backpage);
+
+ page_cache_release(backpage);
+ backpage = NULL;
+
+ if (!pagevec_add(mark_pvec, netpage))
+ fscache_mark_pages_cached(op, mark_pvec);
+
+ page_cache_get(netpage);
+ if (!pagevec_add(&lru_pvec, netpage))
+ __pagevec_lru_add_file(&lru_pvec);
+
+ fscache_end_io(op, netpage, 0);
+ page_cache_release(netpage);
+ netpage = NULL;
+ continue;
+ }
+
+ netpage = NULL;
+
+ _debug("out");
+
+out:
+ /* tidy up */
+ pagevec_lru_add_file(&lru_pvec);
+
+ if (newpage)
+ page_cache_release(newpage);
+ if (netpage)
+ page_cache_release(netpage);
+ if (backpage)
+ page_cache_release(backpage);
+ if (monitor) {
+ fscache_put_retrieval(op);
+ kfree(monitor);
+ }
+
+ list_for_each_entry_safe(netpage, _n, list, lru) {
+ list_del(&netpage->lru);
+ page_cache_release(netpage);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+
+nomem:
+ _debug("nomem");
+ ret = -ENOMEM;
+ goto out;
+
+read_error:
+ _debug("read error %d", ret);
+ if (ret == -ENOMEM)
+ goto out;
+io_error:
+ cachefiles_io_error_obj(object, "Page read error on backing file");
+ ret = -ENOBUFS;
+ goto out;
+}
+
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ */
+int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
+ struct list_head *pages,
+ unsigned *nr_pages,
+ gfp_t gfp)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct list_head backpages;
+ struct pagevec pagevec;
+ struct inode *inode;
+ struct page *page, *_n;
+ unsigned shift, nrbackpages;
+ int ret, ret2, space;
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ _enter("{OBJ%x,%d},,%d,,",
+ object->fscache.debug_id, atomic_read(&op->op.usage),
+ *nr_pages);
+
+ if (!object->backer)
+ return -ENOBUFS;
+
+ space = 1;
+ if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
+ space = 0;
+
+ inode = object->backer->d_inode;
+ ASSERT(S_ISREG(inode->i_mode));
+ ASSERT(inode->i_mapping->a_ops->bmap);
+ ASSERT(inode->i_mapping->a_ops->readpages);
+
+ /* calculate the shift required to use bmap */
+ if (inode->i_sb->s_blocksize > PAGE_SIZE)
+ return -ENOBUFS;
+
+ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+ pagevec_init(&pagevec, 0);
+
+ op->op.flags = FSCACHE_OP_FAST;
+ op->op.processor = cachefiles_read_copier;
+
+ INIT_LIST_HEAD(&backpages);
+ nrbackpages = 0;
+
+ ret = space ? -ENODATA : -ENOBUFS;
+ list_for_each_entry_safe(page, _n, pages, lru) {
+ sector_t block0, block;
+
+ /* we assume the absence or presence of the first block is a
+ * good enough indication for the page as a whole
+ * - TODO: don't use bmap() for this as it is _not_ actually
+ * good enough for this as it doesn't indicate errors, but
+ * it's all we've got for the moment
+ */
+ block0 = page->index;
+ block0 <<= shift;
+
+ block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
+ block0);
+ _debug("%llx -> %llx",
+ (unsigned long long) block0,
+ (unsigned long long) block);
+
+ if (block) {
+ /* we have data - add it to the list to give to the
+ * backing fs */
+ list_move(&page->lru, &backpages);
+ (*nr_pages)--;
+ nrbackpages++;
+ } else if (space && pagevec_add(&pagevec, page) == 0) {
+ fscache_mark_pages_cached(op, &pagevec);
+ ret = -ENODATA;
+ }
+ }
+
+ if (pagevec_count(&pagevec) > 0)
+ fscache_mark_pages_cached(op, &pagevec);
+
+ if (list_empty(pages))
+ ret = 0;
+
+ /* submit the apparently valid pages to the backing fs to be read from
+ * disk */
+ if (nrbackpages > 0) {
+ ret2 = cachefiles_read_backing_file(object, op, &backpages,
+ &pagevec);
+ if (ret2 == -ENOMEM || ret2 == -EINTR)
+ ret = ret2;
+ }
+
+ if (pagevec_count(&pagevec) > 0)
+ fscache_mark_pages_cached(op, &pagevec);
+
+ _leave(" = %d [nr=%u%s]",
+ ret, *nr_pages, list_empty(pages) ? " empty" : "");
+ return ret;
+}
+
+/*
+ * allocate a block in the cache in which to store a page
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - otherwise:
+ * - the metadata will be retained
+ * - 0 will be returned
+ */
+int cachefiles_allocate_page(struct fscache_retrieval *op,
+ struct page *page,
+ gfp_t gfp)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct pagevec pagevec;
+ int ret;
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ _enter("%p,{%lx},", object, page->index);
+
+ ret = cachefiles_has_space(cache, 0, 1);
+ if (ret == 0) {
+ pagevec_init(&pagevec, 0);
+ pagevec_add(&pagevec, page);
+ fscache_mark_pages_cached(op, &pagevec);
+ } else {
+ ret = -ENOBUFS;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * allocate blocks in the cache in which to store a set of pages
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if some buffers couldn't be made available
+ * - returns -ENOBUFS if some pages are beyond EOF
+ * - otherwise:
+ * - -ENODATA will be returned
+ * - metadata will be retained for any page marked
+ */
+int cachefiles_allocate_pages(struct fscache_retrieval *op,
+ struct list_head *pages,
+ unsigned *nr_pages,
+ gfp_t gfp)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct pagevec pagevec;
+ struct page *page;
+ int ret;
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ _enter("%p,,,%d,", object, *nr_pages);
+
+ ret = cachefiles_has_space(cache, 0, *nr_pages);
+ if (ret == 0) {
+ pagevec_init(&pagevec, 0);
+
+ list_for_each_entry(page, pages, lru) {
+ if (pagevec_add(&pagevec, page) == 0)
+ fscache_mark_pages_cached(op, &pagevec);
+ }
+
+ if (pagevec_count(&pagevec) > 0)
+ fscache_mark_pages_cached(op, &pagevec);
+ ret = -ENODATA;
+ } else {
+ ret = -ENOBUFS;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * request a page be stored in the cache
+ * - cache withdrawal is prevented by the caller
+ * - this request may be ignored if there's no cache block available, in which
+ * case -ENOBUFS will be returned
+ * - if the op is in progress, 0 will be returned
+ */
+int cachefiles_write_page(struct fscache_storage *op, struct page *page)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ mm_segment_t old_fs;
+ struct file *file;
+ loff_t pos;
+ void *data;
+ int ret;
+
+ ASSERT(op != NULL);
+ ASSERT(page != NULL);
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+
+ _enter("%p,%p{%lx},,,", object, page, page->index);
+
+ if (!object->backer) {
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+ }
+
+ ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ /* write the page to the backing filesystem and let it store it in its
+ * own time */
+ dget(object->backer);
+ mntget(cache->mnt);
+ file = dentry_open(object->backer, cache->mnt, O_RDWR,
+ cache->cache_cred);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ } else {
+ ret = -EIO;
+ if (file->f_op->write) {
+ pos = (loff_t) page->index << PAGE_SHIFT;
+ data = kmap(page);
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = file->f_op->write(
+ file, (const void __user *) data, PAGE_SIZE,
+ &pos);
+ set_fs(old_fs);
+ kunmap(page);
+ if (ret != PAGE_SIZE)
+ ret = -EIO;
+ }
+ fput(file);
+ }
+
+ if (ret < 0) {
+ if (ret == -EIO)
+ cachefiles_io_error_obj(
+ object, "Write page to backing file failed");
+ ret = -ENOBUFS;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * detach a backing block from a page
+ * - cache withdrawal is prevented by the caller
+ */
+void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+
+ object = container_of(_object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ _enter("%p,{%lu}", object, page->index);
+
+ spin_unlock(&object->fscache.cookie->lock);
+}
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644
index 00000000000..b5808cdb223
--- /dev/null
+++ b/fs/cachefiles/security.c
@@ -0,0 +1,116 @@
+/* CacheFiles security management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * determine the security context within which we access the cache from within
+ * the kernel
+ */
+int cachefiles_get_security_ID(struct cachefiles_cache *cache)
+{
+ struct cred *new;
+ int ret;
+
+ _enter("{%s}", cache->secctx);
+
+ new = prepare_kernel_cred(current);
+ if (!new) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (cache->secctx) {
+ ret = set_security_override_from_ctx(new, cache->secctx);
+ if (ret < 0) {
+ put_cred(new);
+ printk(KERN_ERR "CacheFiles:"
+ " Security denies permission to nominate"
+ " security context: error %d\n",
+ ret);
+ goto error;
+ }
+ }
+
+ cache->cache_cred = new;
+ ret = 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * see if mkdir and create can be performed in the root directory
+ */
+static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
+ struct dentry *root)
+{
+ int ret;
+
+ ret = security_inode_mkdir(root->d_inode, root, 0);
+ if (ret < 0) {
+ printk(KERN_ERR "CacheFiles:"
+ " Security denies permission to make dirs: error %d",
+ ret);
+ return ret;
+ }
+
+ ret = security_inode_create(root->d_inode, root, 0);
+ if (ret < 0)
+ printk(KERN_ERR "CacheFiles:"
+ " Security denies permission to create files: error %d",
+ ret);
+
+ return ret;
+}
+
+/*
+ * check the security details of the on-disk cache
+ * - must be called with security override in force
+ */
+int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+ struct dentry *root,
+ const struct cred **_saved_cred)
+{
+ struct cred *new;
+ int ret;
+
+ _enter("");
+
+ /* duplicate the cache creds for COW (the override is currently in
+ * force, so we can use prepare_creds() to do this) */
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ cachefiles_end_secure(cache, *_saved_cred);
+
+ /* use the cache root dir's security context as the basis with
+ * which create files */
+ ret = set_create_files_as(new, root->d_inode);
+ if (ret < 0) {
+ _leave(" = %d [cfa]", ret);
+ return ret;
+ }
+
+ put_cred(cache->cache_cred);
+ cache->cache_cred = new;
+
+ cachefiles_begin_secure(cache, _saved_cred);
+ ret = cachefiles_check_cache_dir(cache, root);
+
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644
index 00000000000..f3e7a0bf068
--- /dev/null
+++ b/fs/cachefiles/xattr.c
@@ -0,0 +1,291 @@
+/* CacheFiles extended attribute management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include "internal.h"
+
+static const char cachefiles_xattr_cache[] =
+ XATTR_USER_PREFIX "CacheFiles.cache";
+
+/*
+ * check the type label on an object
+ * - done using xattrs
+ */
+int cachefiles_check_object_type(struct cachefiles_object *object)
+{
+ struct dentry *dentry = object->dentry;
+ char type[3], xtype[3];
+ int ret;
+
+ ASSERT(dentry);
+ ASSERT(dentry->d_inode);
+
+ if (!object->fscache.cookie)
+ strcpy(type, "C3");
+ else
+ snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
+
+ _enter("%p{%s}", object, type);
+
+ /* attempt to install a type label directly */
+ ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
+ XATTR_CREATE);
+ if (ret == 0) {
+ _debug("SET"); /* we succeeded */
+ goto error;
+ }
+
+ if (ret != -EEXIST) {
+ kerror("Can't set xattr on %*.*s [%lu] (err %d)",
+ dentry->d_name.len, dentry->d_name.len,
+ dentry->d_name.name, dentry->d_inode->i_ino,
+ -ret);
+ goto error;
+ }
+
+ /* read the current type label */
+ ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+ if (ret < 0) {
+ if (ret == -ERANGE)
+ goto bad_type_length;
+
+ kerror("Can't read xattr on %*.*s [%lu] (err %d)",
+ dentry->d_name.len, dentry->d_name.len,
+ dentry->d_name.name, dentry->d_inode->i_ino,
+ -ret);
+ goto error;
+ }
+
+ /* check the type is what we're expecting */
+ if (ret != 2)
+ goto bad_type_length;
+
+ if (xtype[0] != type[0] || xtype[1] != type[1])
+ goto bad_type;
+
+ ret = 0;
+
+error:
+ _leave(" = %d", ret);
+ return ret;
+
+bad_type_length:
+ kerror("Cache object %lu type xattr length incorrect",
+ dentry->d_inode->i_ino);
+ ret = -EIO;
+ goto error;
+
+bad_type:
+ xtype[2] = 0;
+ kerror("Cache object %*.*s [%lu] type %s not %s",
+ dentry->d_name.len, dentry->d_name.len,
+ dentry->d_name.name, dentry->d_inode->i_ino,
+ xtype, type);
+ ret = -EIO;
+ goto error;
+}
+
+/*
+ * set the state xattr on a cache file
+ */
+int cachefiles_set_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata)
+{
+ struct dentry *dentry = object->dentry;
+ int ret;
+
+ ASSERT(object->fscache.cookie);
+ ASSERT(dentry);
+
+ _enter("%p,#%d", object, auxdata->len);
+
+ /* attempt to install the cache metadata directly */
+ _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+ ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+ &auxdata->type, auxdata->len,
+ XATTR_CREATE);
+ if (ret < 0 && ret != -ENOMEM)
+ cachefiles_io_error_obj(
+ object,
+ "Failed to set xattr with error %d", ret);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * update the state xattr on a cache file
+ */
+int cachefiles_update_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata)
+{
+ struct dentry *dentry = object->dentry;
+ int ret;
+
+ ASSERT(object->fscache.cookie);
+ ASSERT(dentry);
+
+ _enter("%p,#%d", object, auxdata->len);
+
+ /* attempt to install the cache metadata directly */
+ _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+ ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+ &auxdata->type, auxdata->len,
+ XATTR_REPLACE);
+ if (ret < 0 && ret != -ENOMEM)
+ cachefiles_io_error_obj(
+ object,
+ "Failed to update xattr with error %d", ret);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * check the state xattr on a cache file
+ * - return -ESTALE if the object should be deleted
+ */
+int cachefiles_check_object_xattr(struct cachefiles_object *object,
+ struct cachefiles_xattr *auxdata)
+{
+ struct cachefiles_xattr *auxbuf;
+ struct dentry *dentry = object->dentry;
+ int ret;
+
+ _enter("%p,#%d", object, auxdata->len);
+
+ ASSERT(dentry);
+ ASSERT(dentry->d_inode);
+
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ if (!auxbuf) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ /* read the current type label */
+ ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+ &auxbuf->type, 512 + 1);
+ if (ret < 0) {
+ if (ret == -ENODATA)
+ goto stale; /* no attribute - power went off
+ * mid-cull? */
+
+ if (ret == -ERANGE)
+ goto bad_type_length;
+
+ cachefiles_io_error_obj(object,
+ "Can't read xattr on %lu (err %d)",
+ dentry->d_inode->i_ino, -ret);
+ goto error;
+ }
+
+ /* check the on-disk object */
+ if (ret < 1)
+ goto bad_type_length;
+
+ if (auxbuf->type != auxdata->type)
+ goto stale;
+
+ auxbuf->len = ret;
+
+ /* consult the netfs */
+ if (object->fscache.cookie->def->check_aux) {
+ enum fscache_checkaux result;
+ unsigned int dlen;
+
+ dlen = auxbuf->len - 1;
+
+ _debug("checkaux %s #%u",
+ object->fscache.cookie->def->name, dlen);
+
+ result = fscache_check_aux(&object->fscache,
+ &auxbuf->data, dlen);
+
+ switch (result) {
+ /* entry okay as is */
+ case FSCACHE_CHECKAUX_OKAY:
+ goto okay;
+
+ /* entry requires update */
+ case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+ break;
+
+ /* entry requires deletion */
+ case FSCACHE_CHECKAUX_OBSOLETE:
+ goto stale;
+
+ default:
+ BUG();
+ }
+
+ /* update the current label */
+ ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+ &auxdata->type, auxdata->len,
+ XATTR_REPLACE);
+ if (ret < 0) {
+ cachefiles_io_error_obj(object,
+ "Can't update xattr on %lu"
+ " (error %d)",
+ dentry->d_inode->i_ino, -ret);
+ goto error;
+ }
+ }
+
+okay:
+ ret = 0;
+
+error:
+ kfree(auxbuf);
+ _leave(" = %d", ret);
+ return ret;
+
+bad_type_length:
+ kerror("Cache object %lu xattr length incorrect",
+ dentry->d_inode->i_ino);
+ ret = -EIO;
+ goto error;
+
+stale:
+ ret = -ESTALE;
+ goto error;
+}
+
+/*
+ * remove the object's xattr to mark it stale
+ */
+int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+ struct dentry *dentry)
+{
+ int ret;
+
+ ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+ if (ret < 0) {
+ if (ret == -ENOENT || ret == -ENODATA)
+ ret = 0;
+ else if (ret != -ENOMEM)
+ cachefiles_io_error(cache,
+ "Can't remove xattr from %lu"
+ " (error %d)",
+ dentry->d_inode->i_ino, -ret);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8..54dce78fbb7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
return -ENOMEM;
}
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
if (oplockEnabled)
oplock = REQ_OPLOCK;
@@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
rc = -ENOMEM;
else if (pTcon->unix_ext) {
struct cifs_unix_set_info_args args = {
- .mode = mode & ~current->fs->umask,
+ .mode = mode & ~current_umask(),
.ctime = NO_CHANGE_64,
.atime = NO_CHANGE_64,
.mtime = NO_CHANGE_64,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc6080..f121a80fdd6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
goto mkdir_out;
}
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
mode, NULL /* netfid */, pInfo, &oplock,
full_path, cifs_sb->local_nls,
@@ -1204,7 +1204,7 @@ mkdir_get_info:
if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
direntry->d_inode->i_nlink = 2;
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
/* must turn on setgid bit if parent dir has it */
if (inode->i_mode & S_ISGID)
mode |= S_ISGID;
diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5..1c859dae758 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
#include <linux/poll.h>
#include <linux/mm.h>
#include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1195,16 +1196,12 @@ out:
return ret;
}
-asmlinkage ssize_t
-compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen)
+static size_t compat_readv(struct file *file,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t *pos)
{
- struct file *file;
ssize_t ret = -EBADF;
- file = fget(fd);
- if (!file)
- return -EBADF;
-
if (!(file->f_mode & FMODE_READ))
goto out;
@@ -1212,25 +1209,56 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
goto out;
- ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
+ ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
out:
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
- fput(file);
return ret;
}
asmlinkage ssize_t
-compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen)
+compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen)
{
struct file *file;
- ssize_t ret = -EBADF;
+ int fput_needed;
+ ssize_t ret;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
return -EBADF;
+ ret = compat_readv(file, vec, vlen, &file->f_pos);
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, u32 pos_high, u32 pos_low)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ struct file *file;
+ int fput_needed;
+ ssize_t ret;
+
+ if (pos < 0)
+ return -EINVAL;
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+ ret = compat_readv(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+static size_t compat_writev(struct file *file,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t *pos)
+{
+ ssize_t ret = -EBADF;
+
if (!(file->f_mode & FMODE_WRITE))
goto out;
@@ -1238,13 +1266,47 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
goto out;
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
out:
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
- fput(file);
+ return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen)
+{
+ struct file *file;
+ int fput_needed;
+ ssize_t ret;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+ ret = compat_writev(file, vec, vlen, &file->f_pos);
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+asmlinkage ssize_t
+compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
+ unsigned long vlen, u32 pos_high, u32 pos_low)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ struct file *file;
+ int fput_needed;
+ ssize_t ret;
+
+ if (pos < 0)
+ return -EINVAL;
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+ ret = compat_writev(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
return ret;
}
@@ -1441,12 +1503,15 @@ int compat_do_execve(char * filename,
bprm->cred = prepare_exec_creds();
if (!bprm->cred)
goto out_unlock;
- check_unsafe_exec(bprm);
+
+ retval = check_unsafe_exec(bprm);
+ if (retval)
+ goto out_unlock;
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
- goto out_unlock;
+ goto out_unmark;
sched_exec();
@@ -1488,6 +1553,9 @@ int compat_do_execve(char * filename,
goto out;
/* execve succeeded */
+ write_lock(&current->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(&current->fs->lock);
current->in_execve = 0;
mutex_unlock(&current->cred_exec_mutex);
acct_update_integrals(current);
@@ -1506,6 +1574,11 @@ out_file:
fput(bprm->file);
}
+out_unmark:
+ write_lock(&current->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(&current->fs->lock);
+
out_unlock:
current->in_execve = 0;
mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ff786687e93..3e87ce443ea 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,7 @@
#include <linux/if.h>
#include <linux/if_bridge.h>
#include <linux/slab.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
#include <linux/kd.h>
#include <linux/route.h>
#include <linux/in6.h>
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a07338d2d14..dd3634e4c96 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -318,6 +318,7 @@ out:
static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = CRAMFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
@@ -326,6 +327,8 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = 0;
buf->f_files = CRAMFS_SB(sb)->files;
buf->f_ffree = 0;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = CRAMFS_MAXPATHLEN;
return 0;
}
@@ -459,11 +462,14 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
static int cramfs_readpage(struct file *file, struct page * page)
{
struct inode *inode = page->mapping->host;
- u32 maxblock, bytes_filled;
+ u32 maxblock;
+ int bytes_filled;
void *pgdata;
maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
bytes_filled = 0;
+ pgdata = kmap(page);
+
if (page->index < maxblock) {
struct super_block *sb = inode->i_sb;
u32 blkptr_offset = OFFSET(inode) + page->index*4;
@@ -472,30 +478,43 @@ static int cramfs_readpage(struct file *file, struct page * page)
start_offset = OFFSET(inode) + maxblock*4;
mutex_lock(&read_mutex);
if (page->index)
- start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, 4);
- compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - start_offset);
+ start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
+ 4);
+ compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
+ start_offset);
mutex_unlock(&read_mutex);
- pgdata = kmap(page);
+
if (compr_len == 0)
; /* hole */
- else if (compr_len > (PAGE_CACHE_SIZE << 1))
- printk(KERN_ERR "cramfs: bad compressed blocksize %u\n", compr_len);
- else {
+ else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+ pr_err("cramfs: bad compressed blocksize %u\n",
+ compr_len);
+ goto err;
+ } else {
mutex_lock(&read_mutex);
bytes_filled = cramfs_uncompress_block(pgdata,
PAGE_CACHE_SIZE,
cramfs_read(sb, start_offset, compr_len),
compr_len);
mutex_unlock(&read_mutex);
+ if (unlikely(bytes_filled < 0))
+ goto err;
}
- } else
- pgdata = kmap(page);
+ }
+
memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
- kunmap(page);
flush_dcache_page(page);
+ kunmap(page);
SetPageUptodate(page);
unlock_page(page);
return 0;
+
+err:
+ kunmap(page);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ unlock_page(page);
+ return 0;
}
static const struct address_space_operations cramfs_aops = {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index fc3ccb74626..023329800d2 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -50,7 +50,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
err:
printk("Error %d while decompressing!\n", err);
printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
- return 0;
+ return -EIO;
}
int cramfs_uncompress_init(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b11..761d30be268 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,7 +17,6 @@
#include <linux/syscalls.h>
#include <linux/string.h>
#include <linux/mm.h>
-#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
@@ -32,6 +31,7 @@
#include <linux/seqlock.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
+#include <linux/fs_struct.h>
#include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 44d725f612c..b6a719a909f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
if (inode->i_mapping->nrpages == 0)
continue;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c314..af737bb56cb 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
out_release_free_unlock:
crypto_free_hash(s->hash_desc.tfm);
out_free_unlock:
- memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
- kfree(s->block_aligned_filename);
+ kzfree(s->block_aligned_filename);
out_unlock:
mutex_unlock(s->tfm_mutex);
out:
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e0..295e7fa5675 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
if (daemon->user_ns)
put_user_ns(daemon->user_ns);
mutex_unlock(&daemon->mux);
- memset(daemon, 0, sizeof(*daemon));
- kfree(daemon);
+ kzfree(daemon);
out:
return rc;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 73b19cfc91f..f0494281081 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -329,18 +329,22 @@ out_no_fs:
}
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
- struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
+ struct super_block *sb = dentry->d_sb;
+ struct efs_sb_info *sbi = SUPER_INFO(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */
buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */
- buf->f_blocks = sb->total_groups * /* total data blocks */
- (sb->group_size - sb->inode_blocks);
- buf->f_bfree = sb->data_free; /* free data blocks */
- buf->f_bavail = sb->data_free; /* free blocks for non-root */
- buf->f_files = sb->total_groups * /* total inodes */
- sb->inode_blocks *
+ buf->f_blocks = sbi->total_groups * /* total data blocks */
+ (sbi->group_size - sbi->inode_blocks);
+ buf->f_bfree = sbi->data_free; /* free data blocks */
+ buf->f_bavail = sbi->data_free; /* free blocks for non-root */
+ buf->f_files = sbi->total_groups * /* total inodes */
+ sbi->inode_blocks *
(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
- buf->f_ffree = sb->inode_free; /* free inodes */
+ buf->f_ffree = sbi->inode_free; /* free inodes */
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa..2a701d593d3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
* issue a wakeup.
*/
__u64 count;
+ unsigned int flags;
};
/*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
n = (int) (ULLONG_MAX - ctx->count);
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, POLLIN);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
{
struct eventfd_ctx *ctx = file->private_data;
ssize_t res;
- __u64 ucnt;
+ __u64 ucnt = 0;
DECLARE_WAITQUEUE(wait, current);
if (count < sizeof(ucnt))
return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
res = -EAGAIN;
- ucnt = ctx->count;
- if (ucnt > 0)
+ if (ctx->count > 0)
res = sizeof(ucnt);
else if (!(file->f_flags & O_NONBLOCK)) {
__add_wait_queue(&ctx->wqh, &wait);
for (res = 0;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (ctx->count > 0) {
- ucnt = ctx->count;
res = sizeof(ucnt);
break;
}
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
__remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING);
}
- if (res > 0) {
- ctx->count = 0;
+ if (likely(res > 0)) {
+ ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ ctx->count -= ucnt;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, POLLOUT);
}
spin_unlock_irq(&ctx->wqh.lock);
if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
__remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING);
}
- if (res > 0) {
+ if (likely(res > 0)) {
ctx->count += ucnt;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked(&ctx->wqh);
+ wake_up_locked_poll(&ctx->wqh, POLLIN);
}
spin_unlock_irq(&ctx->wqh.lock);
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
- if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+ if (flags & ~EFD_FLAGS_SET)
return -EINVAL;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
+ ctx->flags = flags;
/*
* When we call this, the initialization must be complete, since
* anon_inode_getfd() will install the fd.
*/
fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
- flags & (O_CLOEXEC | O_NONBLOCK));
+ flags & EFD_SHARED_FCNTL_FLAGS);
if (fd < 0)
kfree(ctx);
return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
return sys_eventfd2(count, 0);
}
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd..a89f370fadb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
/*
- * fs/eventpoll.c (Efficent event polling implementation)
- * Copyright (C) 2001,...,2007 Davide Libenzi
+ * fs/eventpoll.c (Efficient event retrieval implementation)
+ * Copyright (C) 2001,...,2009 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
* a better scalability.
*/
-#define DEBUG_EPOLL 0
-
-#if DEBUG_EPOLL > 0
-#define DPRINTK(x) printk x
-#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
-#else /* #if DEBUG_EPOLL > 0 */
-#define DPRINTK(x) (void) 0
-#define DNPRINTK(n, x) (void) 0
-#endif /* #if DEBUG_EPOLL > 0 */
-
-#define DEBUG_EPI 0
-
-#if DEBUG_EPI != 0
-#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
-#else /* #if DEBUG_EPI != 0 */
-#define EPI_SLAB_DEBUG 0
-#endif /* #if DEBUG_EPI != 0 */
-
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
-/* Maximum number of poll wake up nests we are allowing */
-#define EP_MAX_POLLWAKE_NESTS 4
+/* Maximum number of nesting allowed inside epoll sets */
+#define EP_MAX_NESTS 4
/* Maximum msec timeout value storeable in a long int */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
};
/*
- * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
- * It is used to keep track on all tasks that are currently inside the wake_up() code
- * to 1) short-circuit the one coming from the same task and same wait queue head
- * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
- * 3) let go the ones coming from other tasks.
+ * Structure used to track possible nested calls, for too deep recursions
+ * and loop cycles.
*/
-struct wake_task_node {
+struct nested_call_node {
struct list_head llink;
- struct task_struct *task;
- wait_queue_head_t *wq;
+ void *cookie;
+ int cpu;
};
/*
- * This is used to implement the safe poll wake up avoiding to reenter
- * the poll callback from inside wake_up().
+ * This structure is used as collector for nested calls, to check for
+ * maximum recursion dept and loop cycles.
*/
-struct poll_safewake {
- struct list_head wake_task_list;
+struct nested_calls {
+ struct list_head tasks_call_list;
spinlock_t lock;
};
@@ -213,7 +192,7 @@ struct eppoll_entry {
struct list_head llink;
/* The "base" pointer is set to the container "struct epitem" */
- void *base;
+ struct epitem *base;
/*
* Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
struct epitem *epi;
};
+/* Used by the ep_send_events() function as callback private data */
+struct ep_send_events_data {
+ int maxevents;
+ struct epoll_event __user *events;
+};
+
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);
-/* Safe wake up implementation */
-static struct poll_safewake psw;
+/* Used for safe wake up implementation */
+static struct nested_calls poll_safewake_ncalls;
+
+/* Used to call file's f_op->poll() under the nested calls boundaries */
+static struct nested_calls poll_readywalk_ncalls;
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
}
/* Initialize the poll safe wake up structure */
-static void ep_poll_safewake_init(struct poll_safewake *psw)
+static void ep_nested_calls_init(struct nested_calls *ncalls)
{
-
- INIT_LIST_HEAD(&psw->wake_task_list);
- spin_lock_init(&psw->lock);
+ INIT_LIST_HEAD(&ncalls->tasks_call_list);
+ spin_lock_init(&ncalls->lock);
}
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
- * because this one gets called by the poll callback, that in turn is called
- * from inside a wake_up(), that might be called from irq context.
+/**
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
+ * that the recursion limit is not exceeded, and that
+ * the same nested call (by the meaning of same cookie) is
+ * no re-entered.
+ *
+ * @ncalls: Pointer to the nested_calls structure to be used for this call.
+ * @max_nests: Maximum number of allowed nesting calls.
+ * @nproc: Nested call core function pointer.
+ * @priv: Opaque data to be passed to the @nproc callback.
+ * @cookie: Cookie to be used to identify this nested call.
+ *
+ * Returns: Returns the code returned by the @nproc callback, or -1 if
+ * the maximum recursion limit has been exceeded.
*/
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
+static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+ int (*nproc)(void *, void *, int), void *priv,
+ void *cookie)
{
- int wake_nests = 0;
+ int error, call_nests = 0;
unsigned long flags;
- struct task_struct *this_task = current;
- struct list_head *lsthead = &psw->wake_task_list;
- struct wake_task_node *tncur;
- struct wake_task_node tnode;
+ int this_cpu = get_cpu();
+ struct list_head *lsthead = &ncalls->tasks_call_list;
+ struct nested_call_node *tncur;
+ struct nested_call_node tnode;
- spin_lock_irqsave(&psw->lock, flags);
+ spin_lock_irqsave(&ncalls->lock, flags);
- /* Try to see if the current task is already inside this wakeup call */
+ /*
+ * Try to see if the current task is already inside this wakeup call.
+ * We use a list here, since the population inside this set is always
+ * very much limited.
+ */
list_for_each_entry(tncur, lsthead, llink) {
-
- if (tncur->wq == wq ||
- (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
+ if (tncur->cpu == this_cpu &&
+ (tncur->cookie == cookie || ++call_nests > max_nests)) {
/*
* Ops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
*/
- spin_unlock_irqrestore(&psw->lock, flags);
- return;
+ error = -1;
+ goto out_unlock;
}
}
- /* Add the current task to the list */
- tnode.task = this_task;
- tnode.wq = wq;
+ /* Add the current task and cookie to the list */
+ tnode.cpu = this_cpu;
+ tnode.cookie = cookie;
list_add(&tnode.llink, lsthead);
- spin_unlock_irqrestore(&psw->lock, flags);
+ spin_unlock_irqrestore(&ncalls->lock, flags);
- /* Do really wake up now */
- wake_up_nested(wq, 1 + wake_nests);
+ /* Call the nested function */
+ error = (*nproc)(priv, cookie, call_nests);
/* Remove the current task from the list */
- spin_lock_irqsave(&psw->lock, flags);
+ spin_lock_irqsave(&ncalls->lock, flags);
list_del(&tnode.llink);
- spin_unlock_irqrestore(&psw->lock, flags);
+ out_unlock:
+ spin_unlock_irqrestore(&ncalls->lock, flags);
+
+ put_cpu();
+ return error;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+ unsigned long events, int subclass)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
+ wake_up_locked_poll(wqueue, events);
+ spin_unlock_irqrestore(&wqueue->lock, flags);
+}
+#else
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+ unsigned long events, int subclass)
+{
+ wake_up_poll(wqueue, events);
+}
+#endif
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
+{
+ ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
+ 1 + call_nests);
+ return 0;
+}
+
+/*
+ * Perform a safe wake up of the poll wait list. The problem is that
+ * with the new callback'd wake up system, it is possible that the
+ * poll callback is reentered from inside the call to wake_up() done
+ * on the poll wait queue head. The rule is that we cannot reenter the
+ * wake up code from the same task more than EP_MAX_NESTS times,
+ * and we cannot reenter the same wait queue head at all. This will
+ * enable to have a hierarchy of epoll file descriptor of no more than
+ * EP_MAX_NESTS deep.
+ */
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+ ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+ ep_poll_wakeup_proc, NULL, wq);
}
/*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
+ * This function unregisters poll callbacks from the associated file
+ * descriptor. Must be called with "mtx" held (or "epmutex" if called from
+ * ep_free).
*/
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
- int nwait;
struct list_head *lsthead = &epi->pwqlist;
struct eppoll_entry *pwq;
- /* This is called without locks, so we need the atomic exchange */
- nwait = xchg(&epi->nwait, 0);
+ while (!list_empty(lsthead)) {
+ pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
- if (nwait) {
- while (!list_empty(lsthead)) {
- pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+ list_del(&pwq->llink);
+ remove_wait_queue(pwq->whead, &pwq->wait);
+ kmem_cache_free(pwq_cache, pwq);
+ }
+}
- list_del_init(&pwq->llink);
- remove_wait_queue(pwq->whead, &pwq->wait);
- kmem_cache_free(pwq_cache, pwq);
- }
+/**
+ * ep_scan_ready_list - Scans the ready list in a way that makes possible for
+ * the scan code, to call f_op->poll(). Also allows for
+ * O(NumReady) performance.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @sproc: Pointer to the scan callback.
+ * @priv: Private opaque data passed to the @sproc callback.
+ *
+ * Returns: The same integer error code returned by the @sproc callback.
+ */
+static int ep_scan_ready_list(struct eventpoll *ep,
+ int (*sproc)(struct eventpoll *,
+ struct list_head *, void *),
+ void *priv)
+{
+ int error, pwake = 0;
+ unsigned long flags;
+ struct epitem *epi, *nepi;
+ LIST_HEAD(txlist);
+
+ /*
+ * We need to lock this because we could be hit by
+ * eventpoll_release_file() and epoll_ctl().
+ */
+ mutex_lock(&ep->mtx);
+
+ /*
+ * Steal the ready list, and re-init the original one to the
+ * empty list. Also, set ep->ovflist to NULL so that events
+ * happening while looping w/out locks, are not lost. We cannot
+ * have the poll callback to queue directly on ep->rdllist,
+ * because we want the "sproc" callback to be able to do it
+ * in a lockless way.
+ */
+ spin_lock_irqsave(&ep->lock, flags);
+ list_splice_init(&ep->rdllist, &txlist);
+ ep->ovflist = NULL;
+ spin_unlock_irqrestore(&ep->lock, flags);
+
+ /*
+ * Now call the callback function.
+ */
+ error = (*sproc)(ep, &txlist, priv);
+
+ spin_lock_irqsave(&ep->lock, flags);
+ /*
+ * During the time we spent inside the "sproc" callback, some
+ * other events might have been queued by the poll callback.
+ * We re-insert them inside the main ready-list here.
+ */
+ for (nepi = ep->ovflist; (epi = nepi) != NULL;
+ nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+ /*
+ * We need to check if the item is already in the list.
+ * During the "sproc" callback execution time, items are
+ * queued into ->ovflist but the "txlist" might already
+ * contain them, and the list_splice() below takes care of them.
+ */
+ if (!ep_is_linked(&epi->rdllink))
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ }
+ /*
+ * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+ * releasing the lock, events will be queued in the normal way inside
+ * ep->rdllist.
+ */
+ ep->ovflist = EP_UNACTIVE_PTR;
+
+ /*
+ * Quickly re-inject items left on "txlist".
+ */
+ list_splice(&txlist, &ep->rdllist);
+
+ if (!list_empty(&ep->rdllist)) {
+ /*
+ * Wake up (if active) both the eventpoll wait list and
+ * the ->poll() wait list (delayed after we release the lock).
+ */
+ if (waitqueue_active(&ep->wq))
+ wake_up_locked(&ep->wq);
+ if (waitqueue_active(&ep->poll_wait))
+ pwake++;
}
+ spin_unlock_irqrestore(&ep->lock, flags);
+
+ mutex_unlock(&ep->mtx);
+
+ /* We have to call this outside the lock */
+ if (pwake)
+ ep_poll_safewake(&ep->poll_wait);
+
+ return error;
}
/*
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
atomic_dec(&ep->user->epoll_watches);
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
- current, ep, file));
-
return 0;
}
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&psw, &ep->poll_wait);
+ ep_poll_safewake(&ep->poll_wait);
/*
* We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
if (ep)
ep_free(ep);
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
return 0;
}
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+ void *priv)
+{
+ struct epitem *epi, *tmp;
+
+ list_for_each_entry_safe(epi, tmp, head, rdllink) {
+ if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ epi->event.events)
+ return POLLIN | POLLRDNORM;
+ else {
+ /*
+ * Item has been dropped into the ready list by the poll
+ * callback, but it's not actually ready, as far as
+ * caller requested events goes. We can remove it here.
+ */
+ list_del_init(&epi->rdllink);
+ }
+ }
+
+ return 0;
+}
+
+static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+{
+ return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+}
+
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
- unsigned int pollflags = 0;
- unsigned long flags;
+ int pollflags;
struct eventpoll *ep = file->private_data;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
- /* Check our condition */
- spin_lock_irqsave(&ep->lock, flags);
- if (!list_empty(&ep->rdllist))
- pollflags = POLLIN | POLLRDNORM;
- spin_unlock_irqrestore(&ep->lock, flags);
+ /*
+ * Proceed to find out if wanted events are really available inside
+ * the ready list. This need to be done under ep_call_nested()
+ * supervision, since the call to f_op->poll() done on listed files
+ * could re-enter here.
+ */
+ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+ ep_poll_readyevents_proc, ep, ep);
- return pollflags;
+ return pollflags != -1 ? pollflags : 0;
}
/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
* We don't want to get "file->f_lock" because it is not
* necessary. It is not necessary because we're in the "struct file"
* cleanup path, and this means that noone is using this file anymore.
- * So, for example, epoll_ctl() cannot hit here sicne if we reach this
+ * So, for example, epoll_ctl() cannot hit here since if we reach this
* point, the file counter already went to zero and fget() would fail.
* The only hit might come from ep_free() but by holding the mutex
* will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
*pep = ep;
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
- current, ep));
return 0;
free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
}
}
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
- current, file, epir));
-
return epir;
}
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
- current, epi->ffd.file, epi, ep));
-
spin_lock_irqsave(&ep->lock, flags);
/*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
goto out_unlock;
/*
+ * Check the events coming with the callback. At this stage, not
+ * every device reports the events in the "key" parameter of the
+ * callback. We need to be able to handle both cases here, hence the
+ * test for "key" != NULL before the event match test.
+ */
+ if (key && !((unsigned long) key & epi->event.events))
+ goto out_unlock;
+
+ /*
* If we are trasfering events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happens during that period of time are
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
}
/* If this file is already in the ready list we exit soon */
- if (ep_is_linked(&epi->rdllink))
- goto is_linked;
-
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(&epi->rdllink))
+ list_add_tail(&epi->rdllink, &ep->rdllist);
-is_linked:
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
@@ -690,7 +842,7 @@ out_unlock:
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&psw, &ep->poll_wait);
+ ep_poll_safewake(&ep->poll_wait);
return 1;
}
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&psw, &ep->poll_wait);
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
- current, ep, tfile, fd));
+ ep_poll_safewake(&ep->poll_wait);
return 0;
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
{
int pwake = 0;
unsigned int revents;
- unsigned long flags;
/*
- * Set the new event interest mask before calling f_op->poll(), otherwise
- * a potential race might occur. In fact if we do this operation inside
- * the lock, an event might happen between the f_op->poll() call and the
- * new event set registering.
+ * Set the new event interest mask before calling f_op->poll();
+ * otherwise we might miss an event that happens between the
+ * f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events;
+ epi->event.data = event->data; /* protected by mtx */
/*
* Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
*/
revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
- spin_lock_irqsave(&ep->lock, flags);
-
- /* Copy the data member from inside the lock */
- epi->event.data = event->data;
-
/*
* If the item is "hot" and it is not registered inside the ready
* list, push it inside.
*/
if (revents & event->events) {
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
+ spin_unlock_irq(&ep->lock);
}
- spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&psw, &ep->poll_wait);
+ ep_poll_safewake(&ep->poll_wait);
return 0;
}
-static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents)
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+ void *priv)
{
- int eventcnt, error = -EFAULT, pwake = 0;
+ struct ep_send_events_data *esed = priv;
+ int eventcnt;
unsigned int revents;
- unsigned long flags;
- struct epitem *epi, *nepi;
- struct list_head txlist;
-
- INIT_LIST_HEAD(&txlist);
-
- /*
- * We need to lock this because we could be hit by
- * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
- */
- mutex_lock(&ep->mtx);
-
- /*
- * Steal the ready list, and re-init the original one to the
- * empty list. Also, set ep->ovflist to NULL so that events
- * happening while looping w/out locks, are not lost. We cannot
- * have the poll callback to queue directly on ep->rdllist,
- * because we are doing it in the loop below, in a lockless way.
- */
- spin_lock_irqsave(&ep->lock, flags);
- list_splice(&ep->rdllist, &txlist);
- INIT_LIST_HEAD(&ep->rdllist);
- ep->ovflist = NULL;
- spin_unlock_irqrestore(&ep->lock, flags);
+ struct epitem *epi;
+ struct epoll_event __user *uevent;
/*
- * We can loop without lock because this is a task private list.
- * We just splice'd out the ep->rdllist in ep_collect_ready_items().
- * Items cannot vanish during the loop because we are holding "mtx".
+ * We can loop without lock because we are passed a task private list.
+ * Items cannot vanish during the loop because ep_scan_ready_list() is
+ * holding "mtx" during this call.
*/
- for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
- epi = list_first_entry(&txlist, struct epitem, rdllink);
+ for (eventcnt = 0, uevent = esed->events;
+ !list_empty(head) && eventcnt < esed->maxevents;) {
+ epi = list_first_entry(head, struct epitem, rdllink);
list_del_init(&epi->rdllink);
- /*
- * Get the ready file event set. We can safely use the file
- * because we are holding the "mtx" and this will guarantee
- * that both the file and the item will not vanish.
- */
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
- revents &= epi->event.events;
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ epi->event.events;
/*
- * Is the event mask intersect the caller-requested one,
- * deliver the event to userspace. Again, we are holding
- * "mtx", so no operations coming from userspace can change
- * the item.
+ * If the event mask intersect the caller-requested one,
+ * deliver the event to userspace. Again, ep_scan_ready_list()
+ * is holding "mtx", so no operations coming from userspace
+ * can change the item.
*/
if (revents) {
- if (__put_user(revents,
- &events[eventcnt].events) ||
- __put_user(epi->event.data,
- &events[eventcnt].data))
- goto errxit;
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(epi->event.data, &uevent->data)) {
+ list_add(&epi->rdllink, head);
+ return eventcnt ? eventcnt : -EFAULT;
+ }
+ eventcnt++;
+ uevent++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
- eventcnt++;
+ else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, noone can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ }
}
- /*
- * At this point, noone can insert into ep->rdllist besides
- * us. The epoll_ctl() callers are locked out by us holding
- * "mtx" and the poll callback will queue them in ep->ovflist.
- */
- if (!(epi->event.events & EPOLLET) &&
- (revents & epi->event.events))
- list_add_tail(&epi->rdllink, &ep->rdllist);
- }
- error = 0;
-
-errxit:
-
- spin_lock_irqsave(&ep->lock, flags);
- /*
- * During the time we spent in the loop above, some other events
- * might have been queued by the poll callback. We re-insert them
- * inside the main ready-list here.
- */
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
- nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
- /*
- * If the above loop quit with errors, the epoll item might still
- * be linked to "txlist", and the list_splice() done below will
- * take care of those cases.
- */
- if (!ep_is_linked(&epi->rdllink))
- list_add_tail(&epi->rdllink, &ep->rdllist);
}
- /*
- * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
- * releasing the lock, events will be queued in the normal way inside
- * ep->rdllist.
- */
- ep->ovflist = EP_UNACTIVE_PTR;
- /*
- * In case of error in the event-send loop, or in case the number of
- * ready events exceeds the userspace limit, we need to splice the
- * "txlist" back inside ep->rdllist.
- */
- list_splice(&txlist, &ep->rdllist);
-
- if (!list_empty(&ep->rdllist)) {
- /*
- * Wake up (if active) both the eventpoll wait list and the ->poll()
- * wait list (delayed after we release the lock).
- */
- if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
- }
- spin_unlock_irqrestore(&ep->lock, flags);
+ return eventcnt;
+}
- mutex_unlock(&ep->mtx);
+static int ep_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
+{
+ struct ep_send_events_data esed;
- /* We have to call this outside the lock */
- if (pwake)
- ep_poll_safewake(&psw, &ep->poll_wait);
+ esed.maxevents = maxevents;
+ esed.events = events;
- return eventcnt == 0 ? error: eventcnt;
+ return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
}
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
wait_queue_t wait;
/*
- * Calculate the timeout by checking for the "infinite" value ( -1 )
+ * Calculate the timeout by checking for the "infinite" value (-1)
* and the overflow condition. The passed timeout is in milliseconds,
* that why (t * HZ) / 1000.
*/
@@ -1076,9 +1162,8 @@ retry:
set_current_state(TASK_RUNNING);
}
-
/* Is it worth to try to dig for events ? */
- eavail = !list_empty(&ep->rdllist);
+ eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
spin_unlock_irqrestore(&ep->lock, flags);
@@ -1099,41 +1184,30 @@ retry:
*/
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
- int error, fd = -1;
- struct eventpoll *ep;
+ int error;
+ struct eventpoll *ep = NULL;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
- current, flags));
-
/*
- * Create the internal data structure ( "struct eventpoll" ).
+ * Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);
- if (error < 0) {
- fd = error;
- goto error_return;
- }
-
+ if (error < 0)
+ return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
- flags & O_CLOEXEC);
- if (fd < 0)
+ error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+ flags & O_CLOEXEC);
+ if (error < 0)
ep_free(ep);
-error_return:
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
- current, flags, fd));
-
- return fd;
+ return error;
}
SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epitem *epi;
struct epoll_event epds;
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
- current, epfd, op, fd, event));
-
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
-
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
error_fput:
fput(file);
error_return:
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
- current, epfd, op, fd, event, error));
return error;
}
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
struct file *file;
struct eventpoll *ep;
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
- current, epfd, events, maxevents, timeout));
-
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
error_fput:
fput(file);
error_return:
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
- current, epfd, events, maxevents, timeout, error));
return error;
}
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
EP_ITEM_COST;
/* Initialize the structure used to perform safe poll wait head wake ups */
- ep_poll_safewake_init(&psw);
+ ep_nested_calls_init(&poll_safewake_ncalls);
+
+ /* Initialize the structure used to perform file's f_op->poll() calls */
+ ep_nested_calls_init(&poll_readywalk_ncalls);
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
- 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
- NULL);
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
- sizeof(struct eppoll_entry), 0,
- EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
+ sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc916..052a961e41a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1056,28 +1057,35 @@ EXPORT_SYMBOL(install_exec_creds);
* - the caller must hold current->cred_exec_mutex to protect against
* PTRACE_ATTACH
*/
-void check_unsafe_exec(struct linux_binprm *bprm)
+int check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned long flags;
- unsigned n_fs, n_sighand;
+ unsigned n_fs;
+ int res = 0;
bprm->unsafe = tracehook_unsafe_exec(p);
n_fs = 1;
- n_sighand = 1;
+ write_lock(&p->fs->lock);
lock_task_sighand(p, &flags);
for (t = next_thread(p); t != p; t = next_thread(t)) {
if (t->fs == p->fs)
n_fs++;
- n_sighand++;
}
- if (atomic_read(&p->fs->count) > n_fs ||
- atomic_read(&p->sighand->count) > n_sighand)
+ if (p->fs->users > n_fs) {
bprm->unsafe |= LSM_UNSAFE_SHARE;
+ } else {
+ if (p->fs->in_exec)
+ res = -EAGAIN;
+ p->fs->in_exec = 1;
+ }
unlock_task_sighand(p, &flags);
+ write_unlock(&p->fs->lock);
+
+ return res;
}
/*
@@ -1296,12 +1304,15 @@ int do_execve(char * filename,
bprm->cred = prepare_exec_creds();
if (!bprm->cred)
goto out_unlock;
- check_unsafe_exec(bprm);
+
+ retval = check_unsafe_exec(bprm);
+ if (retval)
+ goto out_unlock;
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
- goto out_unlock;
+ goto out_unmark;
sched_exec();
@@ -1344,6 +1355,9 @@ int do_execve(char * filename,
goto out;
/* execve succeeded */
+ write_lock(&current->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(&current->fs->lock);
current->in_execve = 0;
mutex_unlock(&current->cred_exec_mutex);
acct_update_integrals(current);
@@ -1362,6 +1376,11 @@ out_file:
fput(bprm->file);
}
+out_unmark:
+ write_lock(&current->fs->lock);
+ current->fs->in_exec = 0;
+ write_unlock(&current->fs->lock);
+
out_unlock:
current->in_execve = 0;
mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 00000000000..1b2d4c63a57
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+ were written, but the inode attributes failed. Then if the filesystem was
+ unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 00000000000..cc2d22db119
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc. All rights reserved.
+#
+# Authors:
+# Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 00000000000..86194b2f799
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+ tristate "exofs: OSD based file system support"
+ depends on SCSI_OSD_ULD
+ help
+ EXOFS is a file system that uses an OSD storage device,
+ as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+ bool "Enable debugging"
+ depends on EXOFS_FS
+ help
+ This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 00000000000..b1512c4bb8c
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
+#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
+#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
+#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA 1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number. This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+ EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+ (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+ EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT 12
+#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC 0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (mainly, the one
+ * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
+ * on disk. Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+ __le64 s_nextid; /* Highest object ID used */
+ __le32 s_numfiles; /* Number of files on fs */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_newfs; /* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA 5
+
+/*
+ * The file control block - stored in an object's attributes. This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+ __le64 i_size; /* Size of the file */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_flags; /* File flags (unused for now)*/
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+ EXOFS_APAGE_FS_DATA,
+ EXOFS_ATTR_INODE_DATA,
+ EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN 255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+ __le64 inode_no; /* inode number */
+ __le16 rec_len; /* directory entry length */
+ u8 name_len; /* name length */
+ u8 file_type; /* umm...file type */
+ char name[EXOFS_NAME_LEN]; /* file name */
+};
+
+enum {
+ EXOFS_FT_UNKNOWN,
+ EXOFS_FT_REG_FILE,
+ EXOFS_FT_DIR,
+ EXOFS_FT_CHRDEV,
+ EXOFS_FT_BLKDEV,
+ EXOFS_FT_FIFO,
+ EXOFS_FT_SOCK,
+ EXOFS_FT_SYMLINK,
+ EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD 4
+#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+ (((name_len) + offsetof(struct exofs_dir_entry, name) + \
+ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+ const struct osd_obj_id *obj);
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
+static inline int exofs_check_ok(struct osd_request *or)
+{
+ return exofs_check_ok_resid(or, NULL, NULL);
+}
+int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
+int exofs_async_op(struct osd_request *or,
+ osd_req_done_fn *async_done, void *caller_context, u8 *cred);
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
+
+int osd_req_read_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+int osd_req_write_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 00000000000..65b0c8c776a
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+ return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+ kunmap(page);
+ page_cache_release(page);
+}
+
+/* Accesses dir's inode->i_size must be called under inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+ return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+ loff_t last_byte = inode->i_size;
+
+ last_byte -= page_nr << PAGE_CACHE_SHIFT;
+ if (last_byte > PAGE_CACHE_SIZE)
+ last_byte = PAGE_CACHE_SIZE;
+ return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *dir = mapping->host;
+ int err = 0;
+
+ dir->i_version++;
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ if (pos+len > dir->i_size) {
+ i_size_write(dir, pos+len);
+ mark_inode_dirty(dir);
+ }
+ set_page_dirty(page);
+
+ if (IS_DIRSYNC(dir))
+ err = write_one_page(page, 1);
+ else
+ unlock_page(page);
+
+ return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+ struct inode *dir = page->mapping->host;
+ unsigned chunk_size = exofs_chunk_size(dir);
+ char *kaddr = page_address(page);
+ unsigned offs, rec_len;
+ unsigned limit = PAGE_CACHE_SIZE;
+ struct exofs_dir_entry *p;
+ char *error;
+
+ /* if the page is the last one in the directory */
+ if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if (limit & (chunk_size - 1))
+ goto Ebadsize;
+ if (!limit)
+ goto out;
+ }
+ for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+ p = (struct exofs_dir_entry *)(kaddr + offs);
+ rec_len = le16_to_cpu(p->rec_len);
+
+ if (rec_len < EXOFS_DIR_REC_LEN(1))
+ goto Eshort;
+ if (rec_len & 3)
+ goto Ealign;
+ if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+ goto Enamelen;
+ if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+ goto Espan;
+ }
+ if (offs != limit)
+ goto Eend;
+out:
+ SetPageChecked(page);
+ return;
+
+Ebadsize:
+ EXOFS_ERR("ERROR [exofs_check_page]: "
+ "size of directory #%lu is not a multiple of chunk size",
+ dir->i_ino
+ );
+ goto fail;
+Eshort:
+ error = "rec_len is smaller than minimal";
+ goto bad_entry;
+Ealign:
+ error = "unaligned directory entry";
+ goto bad_entry;
+Enamelen:
+ error = "rec_len is too small for name_len";
+ goto bad_entry;
+Espan:
+ error = "directory entry across blocks";
+ goto bad_entry;
+bad_entry:
+ EXOFS_ERR(
+ "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+ "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ _LLU(le64_to_cpu(p->inode_no)),
+ rec_len, p->name_len);
+ goto fail;
+Eend:
+ p = (struct exofs_dir_entry *)(kaddr + offs);
+ EXOFS_ERR("ERROR [exofs_check_page]: "
+ "entry in directory #%lu spans the page boundary"
+ "offset=%lu, inode=%llu",
+ dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ _LLU(le64_to_cpu(p->inode_no)));
+fail:
+ SetPageChecked(page);
+ SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+ struct address_space *mapping = dir->i_mapping;
+ struct page *page = read_mapping_page(mapping, n, NULL);
+
+ if (!IS_ERR(page)) {
+ kmap(page);
+ if (!PageChecked(page))
+ exofs_check_page(page);
+ if (PageError(page))
+ goto fail;
+ }
+ return page;
+
+fail:
+ exofs_put_page(page);
+ return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+ struct exofs_dir_entry *de)
+{
+ if (len != de->name_len)
+ return 0;
+ if (!de->inode_no)
+ return 0;
+ return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+ return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+ struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+ struct exofs_dir_entry *p =
+ (struct exofs_dir_entry *)(base + (offset&mask));
+ while ((char *)p < (char *)de) {
+ if (p->rec_len == 0)
+ break;
+ p = exofs_next_entry(p);
+ }
+ return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+ [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
+ [EXOFS_FT_REG_FILE] = DT_REG,
+ [EXOFS_FT_DIR] = DT_DIR,
+ [EXOFS_FT_CHRDEV] = DT_CHR,
+ [EXOFS_FT_BLKDEV] = DT_BLK,
+ [EXOFS_FT_FIFO] = DT_FIFO,
+ [EXOFS_FT_SOCK] = DT_SOCK,
+ [EXOFS_FT_SYMLINK] = DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+ mode_t mode = inode->i_mode;
+ de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ loff_t pos = filp->f_pos;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ unsigned int offset = pos & ~PAGE_CACHE_MASK;
+ unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned long npages = dir_pages(inode);
+ unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+ unsigned char *types = NULL;
+ int need_revalidate = (filp->f_version != inode->i_version);
+
+ if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+ return 0;
+
+ types = exofs_filetype_table;
+
+ for ( ; n < npages; n++, offset = 0) {
+ char *kaddr, *limit;
+ struct exofs_dir_entry *de;
+ struct page *page = exofs_get_page(inode, n);
+
+ if (IS_ERR(page)) {
+ EXOFS_ERR("ERROR: "
+ "bad page in #%lu",
+ inode->i_ino);
+ filp->f_pos += PAGE_CACHE_SIZE - offset;
+ return PTR_ERR(page);
+ }
+ kaddr = page_address(page);
+ if (unlikely(need_revalidate)) {
+ if (offset) {
+ offset = exofs_validate_entry(kaddr, offset,
+ chunk_mask);
+ filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ }
+ filp->f_version = inode->i_version;
+ need_revalidate = 0;
+ }
+ de = (struct exofs_dir_entry *)(kaddr + offset);
+ limit = kaddr + exofs_last_byte(inode, n) -
+ EXOFS_DIR_REC_LEN(1);
+ for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: "
+ "zero-length directory entry");
+ exofs_put_page(page);
+ return -EIO;
+ }
+ if (de->inode_no) {
+ int over;
+ unsigned char d_type = DT_UNKNOWN;
+
+ if (types && de->file_type < EXOFS_FT_MAX)
+ d_type = types[de->file_type];
+
+ offset = (char *)de - kaddr;
+ over = filldir(dirent, de->name, de->name_len,
+ (n<<PAGE_CACHE_SHIFT) | offset,
+ le64_to_cpu(de->inode_no),
+ d_type);
+ if (over) {
+ exofs_put_page(page);
+ return 0;
+ }
+ }
+ filp->f_pos += le16_to_cpu(de->rec_len);
+ }
+ exofs_put_page(page);
+ }
+
+ return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+ struct dentry *dentry, struct page **res_page)
+{
+ const unsigned char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+ unsigned long start, n;
+ unsigned long npages = dir_pages(dir);
+ struct page *page = NULL;
+ struct exofs_i_info *oi = exofs_i(dir);
+ struct exofs_dir_entry *de;
+
+ if (npages == 0)
+ goto out;
+
+ *res_page = NULL;
+
+ start = oi->i_dir_start_lookup;
+ if (start >= npages)
+ start = 0;
+ n = start;
+ do {
+ char *kaddr;
+ page = exofs_get_page(dir, n);
+ if (!IS_ERR(page)) {
+ kaddr = page_address(page);
+ de = (struct exofs_dir_entry *) kaddr;
+ kaddr += exofs_last_byte(dir, n) - reclen;
+ while ((char *) de <= kaddr) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR(
+ "ERROR: exofs_find_entry: "
+ "zero-length directory entry");
+ exofs_put_page(page);
+ goto out;
+ }
+ if (exofs_match(namelen, name, de))
+ goto found;
+ de = exofs_next_entry(de);
+ }
+ exofs_put_page(page);
+ }
+ if (++n >= npages)
+ n = 0;
+ } while (n != start);
+out:
+ return NULL;
+
+found:
+ *res_page = page;
+ oi->i_dir_start_lookup = n;
+ return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+ struct page *page = exofs_get_page(dir, 0);
+ struct exofs_dir_entry *de = NULL;
+
+ if (!IS_ERR(page)) {
+ de = exofs_next_entry(
+ (struct exofs_dir_entry *)page_address(page));
+ *p = page;
+ }
+ return de;
+}
+
+ino_t exofs_parent_ino(struct dentry *child)
+{
+ struct page *page;
+ struct exofs_dir_entry *de;
+ ino_t ino;
+
+ de = exofs_dotdot(child->d_inode, &page);
+ if (!de)
+ return 0;
+
+ ino = le64_to_cpu(de->inode_no);
+ exofs_put_page(page);
+ return ino;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+ ino_t res = 0;
+ struct exofs_dir_entry *de;
+ struct page *page;
+
+ de = exofs_find_entry(dir, dentry, &page);
+ if (de) {
+ res = le64_to_cpu(de->inode_no);
+ exofs_put_page(page);
+ }
+ return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+ struct page *page, struct inode *inode)
+{
+ loff_t pos = page_offset(page) +
+ (char *) de - (char *) page_address(page);
+ unsigned len = le16_to_cpu(de->rec_len);
+ int err;
+
+ lock_page(page);
+ err = exofs_write_begin(NULL, page->mapping, pos, len,
+ AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (err)
+ EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+ err);
+
+ de->inode_no = cpu_to_le64(inode->i_ino);
+ exofs_set_de_type(de, inode);
+ if (likely(!err))
+ err = exofs_commit_chunk(page, pos, len);
+ exofs_put_page(page);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+ return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const unsigned char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned chunk_size = exofs_chunk_size(dir);
+ unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+ unsigned short rec_len, name_len;
+ struct page *page = NULL;
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct exofs_dir_entry *de;
+ unsigned long npages = dir_pages(dir);
+ unsigned long n;
+ char *kaddr;
+ loff_t pos;
+ int err;
+
+ for (n = 0; n <= npages; n++) {
+ char *dir_end;
+
+ page = exofs_get_page(dir, n);
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto out;
+ lock_page(page);
+ kaddr = page_address(page);
+ dir_end = kaddr + exofs_last_byte(dir, n);
+ de = (struct exofs_dir_entry *)kaddr;
+ kaddr += PAGE_CACHE_SIZE - reclen;
+ while ((char *)de <= kaddr) {
+ if ((char *)de == dir_end) {
+ name_len = 0;
+ rec_len = chunk_size;
+ de->rec_len = cpu_to_le16(chunk_size);
+ de->inode_no = 0;
+ goto got_it;
+ }
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: exofs_add_link: "
+ "zero-length directory entry");
+ err = -EIO;
+ goto out_unlock;
+ }
+ err = -EEXIST;
+ if (exofs_match(namelen, name, de))
+ goto out_unlock;
+ name_len = EXOFS_DIR_REC_LEN(de->name_len);
+ rec_len = le16_to_cpu(de->rec_len);
+ if (!de->inode_no && rec_len >= reclen)
+ goto got_it;
+ if (rec_len >= name_len + reclen)
+ goto got_it;
+ de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+ }
+ unlock_page(page);
+ exofs_put_page(page);
+ }
+
+ EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+ return -EINVAL;
+
+got_it:
+ pos = page_offset(page) +
+ (char *)de - (char *)page_address(page);
+ err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+ &page, NULL);
+ if (err)
+ goto out_unlock;
+ if (de->inode_no) {
+ struct exofs_dir_entry *de1 =
+ (struct exofs_dir_entry *)((char *)de + name_len);
+ de1->rec_len = cpu_to_le16(rec_len - name_len);
+ de->rec_len = cpu_to_le16(name_len);
+ de = de1;
+ }
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+ de->inode_no = cpu_to_le64(inode->i_ino);
+ exofs_set_de_type(de, inode);
+ err = exofs_commit_chunk(page, pos, rec_len);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+ sbi->s_numfiles++;
+
+out_put:
+ exofs_put_page(page);
+out:
+ return err;
+out_unlock:
+ unlock_page(page);
+ goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ char *kaddr = page_address(page);
+ unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+ unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+ loff_t pos;
+ struct exofs_dir_entry *pde = NULL;
+ struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+ int err;
+
+ while (de < dir) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: exofs_delete_entry:"
+ "zero-length directory entry");
+ err = -EIO;
+ goto out;
+ }
+ pde = de;
+ de = exofs_next_entry(de);
+ }
+ if (pde)
+ from = (char *)pde - (char *)page_address(page);
+ pos = page_offset(page) + from;
+ lock_page(page);
+ err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+ &page, NULL);
+ if (err)
+ EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+ err);
+ if (pde)
+ pde->rec_len = cpu_to_le16(to - from);
+ dir->inode_no = 0;
+ if (likely(!err))
+ err = exofs_commit_chunk(page, pos, to - from);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ sbi->s_numfiles--;
+out:
+ exofs_put_page(page);
+ return err;
+}
+
+/* kept aligned on 4 bytes */
+#define THIS_DIR ".\0\0"
+#define PARENT_DIR "..\0"
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = grab_cache_page(mapping, 0);
+ unsigned chunk_size = exofs_chunk_size(inode);
+ struct exofs_dir_entry *de;
+ int err;
+ void *kaddr;
+
+ if (!page)
+ return -ENOMEM;
+
+ err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+ &page, NULL);
+ if (err) {
+ unlock_page(page);
+ goto fail;
+ }
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ de = (struct exofs_dir_entry *)kaddr;
+ de->name_len = 1;
+ de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+ memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
+ de->inode_no = cpu_to_le64(inode->i_ino);
+ exofs_set_de_type(de, inode);
+
+ de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+ de->name_len = 2;
+ de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+ de->inode_no = cpu_to_le64(parent->i_ino);
+ memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
+ exofs_set_de_type(de, inode);
+ kunmap_atomic(page, KM_USER0);
+ err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+ page_cache_release(page);
+ return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+ struct page *page = NULL;
+ unsigned long i, npages = dir_pages(inode);
+
+ for (i = 0; i < npages; i++) {
+ char *kaddr;
+ struct exofs_dir_entry *de;
+ page = exofs_get_page(inode, i);
+
+ if (IS_ERR(page))
+ continue;
+
+ kaddr = page_address(page);
+ de = (struct exofs_dir_entry *)kaddr;
+ kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+ while ((char *)de <= kaddr) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: exofs_empty_dir: "
+ "zero-length directory entry"
+ "kaddr=%p, de=%p\n", kaddr, de);
+ goto not_empty;
+ }
+ if (de->inode_no != 0) {
+ /* check for . and .. */
+ if (de->name[0] != '.')
+ goto not_empty;
+ if (de->name_len > 2)
+ goto not_empty;
+ if (de->name_len < 2) {
+ if (le64_to_cpu(de->inode_no) !=
+ inode->i_ino)
+ goto not_empty;
+ } else if (de->name[1] != '.')
+ goto not_empty;
+ }
+ de = exofs_next_entry(de);
+ }
+ exofs_put_page(page);
+ }
+ return 1;
+
+not_empty:
+ exofs_put_page(page);
+ return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = exofs_readdir,
+};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 00000000000..0fd4c785967
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include "common.h"
+
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+ printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+ do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+ struct osd_dev *s_dev; /* returned by get_osd_dev */
+ osd_id s_pid; /* partition ID of file system*/
+ int s_timeout; /* timeout for OSD operations */
+ uint64_t s_nextid; /* highest object ID used */
+ uint32_t s_numfiles; /* number of files on fs */
+ spinlock_t s_next_gen_lock; /* spinlock for gen # update */
+ u32 s_next_generation; /* next gen # to use */
+ atomic_t s_curr_pending; /* number of pending commands */
+ uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+ unsigned long i_flags; /* various atomic flags */
+ uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+ uint32_t i_dir_start_lookup; /* which page to start lookup */
+ wait_queue_head_t i_wq; /* wait queue for inode */
+ uint64_t i_commit_size; /* the object's written length */
+ uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
+ struct inode vfs_inode; /* normal in-memory inode */
+};
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED 0 /* object will be created soon*/
+#define OBJ_CREATED 1 /* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+ return test_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+ set_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+ return test_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+ set_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi);
+static inline int wait_obj_created(struct exofs_i_info *oi)
+{
+ if (likely(obj_created(oi)))
+ return 0;
+
+ return __exofs_wait_obj_created(oi);
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+ return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX 32000
+
+/*************************
+ * function declarations *
+ *************************/
+/* inode.c */
+void exofs_truncate(struct inode *inode);
+int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, int);
+extern void exofs_delete_inode(struct inode *);
+
+/* dir.c: */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+ struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+ino_t exofs_parent_ino(struct dentry *child);
+int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+ struct inode *);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* dir.c: */
+extern const struct file_operations exofs_dir_operations;
+
+/* file.c */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
+/* inode.c */
+extern const struct address_space_operations exofs_aops;
+
+/* namei.c */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
+#endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 00000000000..6ed7fe48475
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+ int datasync)
+{
+ int ret;
+ struct address_space *mapping = filp->f_mapping;
+
+ ret = filemap_write_and_wait(mapping);
+ if (ret)
+ return ret;
+
+ /*Note: file_fsync below also calles sync_blockdev, which is a no-op
+ * for exofs, but other then that it does sync_inode and
+ * sync_superblock which is what we need here.
+ */
+ return file_fsync(filp, dentry, datasync);
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+ exofs_file_fsync(file, file->f_path.dentry, 1);
+ /* TODO: Flush the OSD target */
+ return 0;
+}
+
+const struct file_operations exofs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = generic_file_aio_write,
+ .mmap = generic_file_mmap,
+ .open = generic_file_open,
+ .release = exofs_release_file,
+ .fsync = exofs_file_fsync,
+ .flush = exofs_flush,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+ .truncate = exofs_truncate,
+ .setattr = exofs_setattr,
+};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 00000000000..ba8d9fab469
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,1303 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <scsi/scsi_device.h>
+
+#include "exofs.h"
+
+#ifdef CONFIG_EXOFS_DEBUG
+# define EXOFS_DEBUG_OBJ_ISIZE 1
+#endif
+
+struct page_collect {
+ struct exofs_sb_info *sbi;
+ struct request_queue *req_q;
+ struct inode *inode;
+ unsigned expected_pages;
+
+ struct bio *bio;
+ unsigned nr_pages;
+ unsigned long length;
+ loff_t pg_first; /* keep 64bit also in 32-arches */
+};
+
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+ struct inode *inode)
+{
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
+
+ pcol->sbi = sbi;
+ pcol->req_q = req_q;
+ pcol->inode = inode;
+ pcol->expected_pages = expected_pages;
+
+ pcol->bio = NULL;
+ pcol->nr_pages = 0;
+ pcol->length = 0;
+ pcol->pg_first = -1;
+
+ EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
+ expected_pages);
+}
+
+static void _pcol_reset(struct page_collect *pcol)
+{
+ pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
+
+ pcol->bio = NULL;
+ pcol->nr_pages = 0;
+ pcol->length = 0;
+ pcol->pg_first = -1;
+ EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+ pcol->inode->i_ino, pcol->expected_pages);
+
+ /* this is probably the end of the loop but in writes
+ * it might not end here. don't be left with nothing
+ */
+ if (!pcol->expected_pages)
+ pcol->expected_pages = 128;
+}
+
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+ int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+
+ for (; pages; pages >>= 1) {
+ pcol->bio = bio_alloc(GFP_KERNEL, pages);
+ if (likely(pcol->bio))
+ return 0;
+ }
+
+ EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+ pcol->expected_pages);
+ return -ENOMEM;
+}
+
+static void pcol_free(struct page_collect *pcol)
+{
+ bio_put(pcol->bio);
+ pcol->bio = NULL;
+}
+
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+ unsigned len)
+{
+ int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
+ if (unlikely(len != added_len))
+ return -ENOMEM;
+
+ ++pcol->nr_pages;
+ pcol->length += len;
+ return 0;
+}
+
+static int update_read_page(struct page *page, int ret)
+{
+ if (ret == 0) {
+ /* Everything is OK */
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ } else if (ret == -EFAULT) {
+ /* In this case we were trying to read something that wasn't on
+ * disk yet - return a page full of zeroes. This should be OK,
+ * because the object should be empty (if there was a write
+ * before this read, the read would be waiting with the page
+ * locked */
+ clear_highpage(page);
+
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ ret = 0; /* recovered error */
+ EXOFS_DBGMSG("recovered read error\n");
+ } else /* Error */
+ SetPageError(page);
+
+ return ret;
+}
+
+static void update_write_page(struct page *page, int ret)
+{
+ if (ret) {
+ mapping_set_error(page->mapping, ret);
+ SetPageError(page);
+ }
+ end_page_writeback(page);
+}
+
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
+static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+ bool do_unlock)
+{
+ struct bio_vec *bvec;
+ int i;
+ u64 resid;
+ u64 good_bytes;
+ u64 length = 0;
+ int ret = exofs_check_ok_resid(or, &resid, NULL);
+
+ osd_end_request(or);
+
+ if (likely(!ret))
+ good_bytes = pcol->length;
+ else if (!resid)
+ good_bytes = 0;
+ else
+ good_bytes = pcol->length - resid;
+
+ EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+ " length=0x%lx nr_pages=%u\n",
+ pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+ pcol->nr_pages);
+
+ __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+ struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+ int page_stat;
+
+ if (inode != pcol->inode)
+ continue; /* osd might add more pages at end */
+
+ if (likely(length < good_bytes))
+ page_stat = 0;
+ else
+ page_stat = ret;
+
+ EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n",
+ inode->i_ino, page->index,
+ page_stat ? "bad_bytes" : "good_bytes");
+
+ ret = update_read_page(page, page_stat);
+ if (do_unlock)
+ unlock_page(page);
+ length += bvec->bv_len;
+ }
+
+ pcol_free(pcol);
+ EXOFS_DBGMSG("readpages_done END\n");
+ return ret;
+}
+
+/* callback of async reads */
+static void readpages_done(struct osd_request *or, void *p)
+{
+ struct page_collect *pcol = p;
+
+ __readpages_done(or, pcol, true);
+ atomic_dec(&pcol->sbi->s_curr_pending);
+ kfree(p);
+}
+
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+ struct page *page = bvec->bv_page;
+
+ if (rw == READ)
+ update_read_page(page, ret);
+ else
+ update_write_page(page, ret);
+
+ unlock_page(page);
+ }
+ pcol_free(pcol);
+}
+
+static int read_exec(struct page_collect *pcol, bool is_sync)
+{
+ struct exofs_i_info *oi = exofs_i(pcol->inode);
+ struct osd_obj_id obj = {pcol->sbi->s_pid,
+ pcol->inode->i_ino + EXOFS_OBJ_OFF};
+ struct osd_request *or = NULL;
+ struct page_collect *pcol_copy = NULL;
+ loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+ int ret;
+
+ if (!pcol->bio)
+ return 0;
+
+ /* see comment in _readpage() about sync reads */
+ WARN_ON(is_sync && (pcol->nr_pages != 1));
+
+ or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ osd_req_read(or, &obj, pcol->bio, i_start);
+
+ if (is_sync) {
+ exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+ return __readpages_done(or, pcol, false);
+ }
+
+ pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+ if (!pcol_copy) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *pcol_copy = *pcol;
+ ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+ if (unlikely(ret))
+ goto err;
+
+ atomic_inc(&pcol->sbi->s_curr_pending);
+
+ EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+ obj.id, _LLU(i_start), pcol->length);
+
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+ return 0;
+
+err:
+ if (!is_sync)
+ _unlock_pcol_pages(pcol, ret, READ);
+ kfree(pcol_copy);
+ if (or)
+ osd_end_request(or);
+ return ret;
+}
+
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as posible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+ struct page_collect *pcol = data;
+ struct inode *inode = pcol->inode;
+ struct exofs_i_info *oi = exofs_i(inode);
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ size_t len;
+ int ret;
+
+ /* FIXME: Just for debugging, will be removed */
+ if (PageUptodate(page))
+ EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+ page->index);
+
+ if (page->index < end_index)
+ len = PAGE_CACHE_SIZE;
+ else if (page->index == end_index)
+ len = i_size & ~PAGE_CACHE_MASK;
+ else
+ len = 0;
+
+ if (!len || !obj_created(oi)) {
+ /* this will be out of bounds, or doesn't exist yet.
+ * Current page is cleared and the request is split
+ */
+ clear_highpage(page);
+
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+
+ unlock_page(page);
+ EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
+ " splitting\n", inode->i_ino, page->index);
+
+ return read_exec(pcol, false);
+ }
+
+try_again:
+
+ if (unlikely(pcol->pg_first == -1)) {
+ pcol->pg_first = page->index;
+ } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+ page->index)) {
+ /* Discontinuity detected, split the request */
+ ret = read_exec(pcol, false);
+ if (unlikely(ret))
+ goto fail;
+ goto try_again;
+ }
+
+ if (!pcol->bio) {
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ goto fail;
+ }
+
+ if (len != PAGE_CACHE_SIZE)
+ zero_user(page, len, PAGE_CACHE_SIZE - len);
+
+ EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+ inode->i_ino, page->index, len);
+
+ ret = pcol_add_page(pcol, page, len);
+ if (ret) {
+ EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+ "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+ page, len, pcol->nr_pages, pcol->length);
+
+ /* split the request, and start again with current page */
+ ret = read_exec(pcol, false);
+ if (unlikely(ret))
+ goto fail;
+
+ goto try_again;
+ }
+
+ return 0;
+
+fail:
+ /* SetPageError(page); ??? */
+ unlock_page(page);
+ return ret;
+}
+
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct page_collect pcol;
+ int ret;
+
+ _pcol_init(&pcol, nr_pages, mapping->host);
+
+ ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+ if (ret) {
+ EXOFS_ERR("read_cache_pages => %d\n", ret);
+ return ret;
+ }
+
+ return read_exec(&pcol, false);
+}
+
+static int _readpage(struct page *page, bool is_sync)
+{
+ struct page_collect pcol;
+ int ret;
+
+ _pcol_init(&pcol, 1, page->mapping->host);
+
+ /* readpage_strip might call read_exec(,async) inside at several places
+ * but this is safe for is_async=0 since read_exec will not do anything
+ * when we have a single page.
+ */
+ ret = readpage_strip(&pcol, page);
+ if (ret) {
+ EXOFS_ERR("_readpage => %d\n", ret);
+ return ret;
+ }
+
+ return read_exec(&pcol, is_sync);
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+ return _readpage(page, false);
+}
+
+/* Callback for osd_write. All writes are asynchronouse */
+static void writepages_done(struct osd_request *or, void *p)
+{
+ struct page_collect *pcol = p;
+ struct bio_vec *bvec;
+ int i;
+ u64 resid;
+ u64 good_bytes;
+ u64 length = 0;
+
+ int ret = exofs_check_ok_resid(or, NULL, &resid);
+
+ osd_end_request(or);
+ atomic_dec(&pcol->sbi->s_curr_pending);
+
+ if (likely(!ret))
+ good_bytes = pcol->length;
+ else if (!resid)
+ good_bytes = 0;
+ else
+ good_bytes = pcol->length - resid;
+
+ EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+ " length=0x%lx nr_pages=%u\n",
+ pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+ pcol->nr_pages);
+
+ __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+ struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+ int page_stat;
+
+ if (inode != pcol->inode)
+ continue; /* osd might add more pages to a bio */
+
+ if (likely(length < good_bytes))
+ page_stat = 0;
+ else
+ page_stat = ret;
+
+ update_write_page(page, page_stat);
+ unlock_page(page);
+ EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n",
+ inode->i_ino, page->index, page_stat);
+
+ length += bvec->bv_len;
+ }
+
+ pcol_free(pcol);
+ kfree(pcol);
+ EXOFS_DBGMSG("writepages_done END\n");
+}
+
+static int write_exec(struct page_collect *pcol)
+{
+ struct exofs_i_info *oi = exofs_i(pcol->inode);
+ struct osd_obj_id obj = {pcol->sbi->s_pid,
+ pcol->inode->i_ino + EXOFS_OBJ_OFF};
+ struct osd_request *or = NULL;
+ struct page_collect *pcol_copy = NULL;
+ loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+ int ret;
+
+ if (!pcol->bio)
+ return 0;
+
+ or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+ if (!pcol_copy) {
+ EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *pcol_copy = *pcol;
+
+ osd_req_write(or, &obj, pcol_copy->bio, i_start);
+ ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+ if (unlikely(ret)) {
+ EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+ goto err;
+ }
+
+ atomic_inc(&pcol->sbi->s_curr_pending);
+ EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+ pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+ pcol->length);
+ /* pages ownership was passed to pcol_copy */
+ _pcol_reset(pcol);
+ return 0;
+
+err:
+ _unlock_pcol_pages(pcol, ret, WRITE);
+ kfree(pcol_copy);
+ if (or)
+ osd_end_request(or);
+ return ret;
+}
+
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ */
+static int writepage_strip(struct page *page,
+ struct writeback_control *wbc_unused, void *data)
+{
+ struct page_collect *pcol = data;
+ struct inode *inode = pcol->inode;
+ struct exofs_i_info *oi = exofs_i(inode);
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ size_t len;
+ int ret;
+
+ BUG_ON(!PageLocked(page));
+
+ ret = wait_obj_created(oi);
+ if (unlikely(ret))
+ goto fail;
+
+ if (page->index < end_index)
+ /* in this case, the page is within the limits of the file */
+ len = PAGE_CACHE_SIZE;
+ else {
+ len = i_size & ~PAGE_CACHE_MASK;
+
+ if (page->index > end_index || !len) {
+ /* in this case, the page is outside the limits
+ * (truncate in progress)
+ */
+ ret = write_exec(pcol);
+ if (unlikely(ret))
+ goto fail;
+ if (PageError(page))
+ ClearPageError(page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+try_again:
+
+ if (unlikely(pcol->pg_first == -1)) {
+ pcol->pg_first = page->index;
+ } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+ page->index)) {
+ /* Discontinuity detected, split the request */
+ ret = write_exec(pcol);
+ if (unlikely(ret))
+ goto fail;
+ goto try_again;
+ }
+
+ if (!pcol->bio) {
+ ret = pcol_try_alloc(pcol);
+ if (unlikely(ret))
+ goto fail;
+ }
+
+ EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+ inode->i_ino, page->index, len);
+
+ ret = pcol_add_page(pcol, page, len);
+ if (unlikely(ret)) {
+ EXOFS_DBGMSG("Failed pcol_add_page "
+ "nr_pages=%u total_length=0x%lx\n",
+ pcol->nr_pages, pcol->length);
+
+ /* split the request, next loop will start again */
+ ret = write_exec(pcol);
+ if (unlikely(ret)) {
+ EXOFS_DBGMSG("write_exec faild => %d", ret);
+ goto fail;
+ }
+
+ goto try_again;
+ }
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+
+ return 0;
+
+fail:
+ set_bit(AS_EIO, &page->mapping->flags);
+ unlock_page(page);
+ return ret;
+}
+
+static int exofs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct page_collect pcol;
+ long start, end, expected_pages;
+ int ret;
+
+ start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = (wbc->range_end == LLONG_MAX) ?
+ start + mapping->nrpages :
+ wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ if (start || end)
+ expected_pages = min(end - start + 1, 32L);
+ else
+ expected_pages = mapping->nrpages;
+
+ EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+ " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+ mapping->host->i_ino, wbc->range_start, wbc->range_end,
+ mapping->nrpages, start, end);
+
+ _pcol_init(&pcol, expected_pages, mapping->host);
+
+ ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+ if (ret) {
+ EXOFS_ERR("write_cache_pages => %d\n", ret);
+ return ret;
+ }
+
+ return write_exec(&pcol);
+}
+
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct page_collect pcol;
+ int ret;
+
+ _pcol_init(&pcol, 1, page->mapping->host);
+
+ ret = writepage_strip(page, NULL, &pcol);
+ if (ret) {
+ EXOFS_ERR("exofs_writepage => %d\n", ret);
+ return ret;
+ }
+
+ return write_exec(&pcol);
+}
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret = 0;
+ struct page *page;
+
+ page = *pagep;
+ if (page == NULL) {
+ ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata);
+ if (ret) {
+ EXOFS_DBGMSG("simple_write_begin faild\n");
+ return ret;
+ }
+
+ page = *pagep;
+ }
+
+ /* read modify write */
+ if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+ ret = _readpage(page, true);
+ if (ret) {
+ /*SetPageError was done by _readpage. Is it ok?*/
+ unlock_page(page);
+ EXOFS_DBGMSG("__readpage_filler faild\n");
+ }
+ }
+
+ return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ *pagep = NULL;
+
+ return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata);
+}
+
+const struct address_space_operations exofs_aops = {
+ .readpage = exofs_readpage,
+ .readpages = exofs_readpages,
+ .writepage = exofs_writepage,
+ .writepages = exofs_writepages,
+ .write_begin = exofs_write_begin_export,
+ .write_end = simple_write_end,
+};
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+
+ return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+/*
+ * get_block_t - Fill in a buffer_head
+ * An OSD takes care of block allocation so we just fake an allocation by
+ * putting in the inode's sector_t in the buffer_head.
+ * TODO: What about the case of create==0 and @iblock does not exist in the
+ * object?
+ */
+static int exofs_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ map_bh(bh_result, inode->i_sb, iblock);
+ return 0;
+}
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+ OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/*
+ * Truncate a file to the specified size - all we have to do is set the size
+ * attribute. We make sure the object exists first.
+ */
+void exofs_truncate(struct inode *inode)
+{
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+ struct osd_request *or;
+ struct osd_attr attr;
+ loff_t isize = i_size_read(inode);
+ __be64 newsize;
+ int ret;
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+ || S_ISLNK(inode->i_mode)))
+ return;
+ if (exofs_inode_is_fast_symlink(inode))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
+ goto fail;
+ }
+
+ osd_req_set_attributes(or, &obj);
+
+ newsize = cpu_to_be64((u64)isize);
+ attr = g_attr_logical_length;
+ attr.val_ptr = &newsize;
+ osd_req_add_set_attr_list(or, &attr, 1);
+
+ /* if we are about to truncate an object, and it hasn't been
+ * created yet, wait
+ */
+ if (unlikely(wait_obj_created(oi)))
+ goto fail;
+
+ ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+ osd_end_request(or);
+ if (ret)
+ goto fail;
+
+out:
+ mark_inode_dirty(inode);
+ return;
+fail:
+ make_bad_inode(inode);
+ goto out;
+}
+
+/*
+ * Set inode attributes - just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = inode_change_ok(inode, iattr);
+ if (error)
+ return error;
+
+ error = inode_setattr(inode, iattr);
+ return error;
+}
+
+/*
+ * Read an inode from the OSD, and return it as is. We also return the size
+ * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * on.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+ struct exofs_fcb *inode, uint64_t *sanity)
+{
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+ struct osd_request *or;
+ struct osd_attr attr;
+ struct osd_obj_id obj = {sbi->s_pid,
+ oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+ int ret;
+
+ exofs_make_credential(oi->i_cred, &obj);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+ return -ENOMEM;
+ }
+ osd_req_get_attributes(or, &obj);
+
+ /* we need the inode attribute */
+ osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+ /* we get the size attributes to do a sanity check */
+ osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+#endif
+
+ ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+ if (ret)
+ goto out;
+
+ attr = g_attr_inode_data;
+ ret = extract_attr_from_req(or, &attr);
+ if (ret) {
+ EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+ goto out;
+ }
+
+ WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+ memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+ attr = g_attr_logical_length;
+ ret = extract_attr_from_req(or, &attr);
+ if (ret) {
+ EXOFS_ERR("ERROR: extract attr from or failed\n");
+ goto out;
+ }
+ *sanity = get_unaligned_be64(attr.val_ptr);
+#endif
+
+out:
+ osd_end_request(or);
+ return ret;
+}
+
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+ struct exofs_i_info *oi;
+ struct exofs_fcb fcb;
+ struct inode *inode;
+ uint64_t uninitialized_var(sanity);
+ int ret;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+ oi = exofs_i(inode);
+
+ /* read the inode from the osd */
+ ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+ if (ret)
+ goto bad_inode;
+
+ init_waitqueue_head(&oi->i_wq);
+ set_obj_created(oi);
+
+ /* copy stuff from on-disk struct to in-memory struct */
+ inode->i_mode = le16_to_cpu(fcb.i_mode);
+ inode->i_uid = le32_to_cpu(fcb.i_uid);
+ inode->i_gid = le32_to_cpu(fcb.i_gid);
+ inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+ inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+ inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+ inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+ inode->i_ctime.tv_nsec =
+ inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+ oi->i_commit_size = le64_to_cpu(fcb.i_size);
+ i_size_write(inode, oi->i_commit_size);
+ inode->i_blkbits = EXOFS_BLKSHIFT;
+ inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+ if ((inode->i_size != sanity) &&
+ (!exofs_inode_is_fast_symlink(inode))) {
+ EXOFS_ERR("WARNING: Size of object from inode and "
+ "attributes differ (%lld != %llu)\n",
+ inode->i_size, _LLU(sanity));
+ }
+#endif
+
+ oi->i_dir_start_lookup = 0;
+
+ if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+ ret = -ESTALE;
+ goto bad_inode;
+ }
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (fcb.i_data[0])
+ inode->i_rdev =
+ old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+ else
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+ } else {
+ memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+ }
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &exofs_file_inode_operations;
+ inode->i_fop = &exofs_file_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &exofs_dir_inode_operations;
+ inode->i_fop = &exofs_dir_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ if (exofs_inode_is_fast_symlink(inode))
+ inode->i_op = &exofs_fast_symlink_inode_operations;
+ else {
+ inode->i_op = &exofs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+ }
+ } else {
+ inode->i_op = &exofs_special_inode_operations;
+ if (fcb.i_data[0])
+ init_special_inode(inode, inode->i_mode,
+ old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+ else
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+ }
+
+ unlock_new_inode(inode);
+ return inode;
+
+bad_inode:
+ iget_failed(inode);
+ return ERR_PTR(ret);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi)
+{
+ if (!obj_created(oi)) {
+ BUG_ON(!obj_2bcreated(oi));
+ wait_event(oi->i_wq, obj_created(oi));
+ }
+ return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
+}
+/*
+ * Callback function from exofs_new_inode(). The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists on
+ * the OSD.
+ */
+static void create_done(struct osd_request *or, void *p)
+{
+ struct inode *inode = p;
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ int ret;
+
+ ret = exofs_check_ok(or);
+ osd_end_request(or);
+ atomic_dec(&sbi->s_curr_pending);
+
+ if (unlikely(ret)) {
+ EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+ _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
+ make_bad_inode(inode);
+ } else
+ set_obj_created(oi);
+
+ atomic_dec(&inode->i_count);
+ wake_up(&oi->i_wq);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, int mode)
+{
+ struct super_block *sb;
+ struct inode *inode;
+ struct exofs_i_info *oi;
+ struct exofs_sb_info *sbi;
+ struct osd_request *or;
+ struct osd_obj_id obj;
+ int ret;
+
+ sb = dir->i_sb;
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ oi = exofs_i(inode);
+
+ init_waitqueue_head(&oi->i_wq);
+ set_obj_2bcreated(oi);
+
+ sbi = sb->s_fs_info;
+
+ sb->s_dirt = 1;
+ inode->i_uid = current->cred->fsuid;
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ inode->i_gid = current->cred->fsgid;
+ }
+ inode->i_mode = mode;
+
+ inode->i_ino = sbi->s_nextid++;
+ inode->i_blkbits = EXOFS_BLKSHIFT;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ oi->i_commit_size = inode->i_size = 0;
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+ insert_inode_hash(inode);
+
+ mark_inode_dirty(inode);
+
+ obj.partition = sbi->s_pid;
+ obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+ exofs_make_credential(oi->i_cred, &obj);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ osd_req_create_object(or, &obj);
+
+ /* increment the refcount so that the inode will still be around when we
+ * reach the callback
+ */
+ atomic_inc(&inode->i_count);
+
+ ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+ if (ret) {
+ atomic_dec(&inode->i_count);
+ osd_end_request(or);
+ return ERR_PTR(-EIO);
+ }
+ atomic_inc(&sbi->s_curr_pending);
+
+ return inode;
+}
+
+/*
+ * struct to pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+ struct exofs_sb_info *sbi;
+ struct exofs_fcb fcb;
+};
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct osd_request *or, void *p)
+{
+ struct updatei_args *args = p;
+
+ osd_end_request(or);
+
+ atomic_dec(&args->sbi->s_curr_pending);
+
+ kfree(args);
+}
+
+/*
+ * Write the inode to the OSD. Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct super_block *sb = inode->i_sb;
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+ struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+ struct osd_request *or;
+ struct osd_attr attr;
+ struct exofs_fcb *fcb;
+ struct updatei_args *args;
+ int ret;
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return -ENOMEM;
+
+ fcb = &args->fcb;
+
+ fcb->i_mode = cpu_to_le16(inode->i_mode);
+ fcb->i_uid = cpu_to_le32(inode->i_uid);
+ fcb->i_gid = cpu_to_le32(inode->i_gid);
+ fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+ fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+ fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ oi->i_commit_size = i_size_read(inode);
+ fcb->i_size = cpu_to_le64(oi->i_commit_size);
+ fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ fcb->i_data[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ fcb->i_data[1] = 0;
+ } else {
+ fcb->i_data[0] = 0;
+ fcb->i_data[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ fcb->i_data[2] = 0;
+ }
+ } else
+ memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+ ret = -ENOMEM;
+ goto free_args;
+ }
+
+ osd_req_set_attributes(or, &obj);
+
+ attr = g_attr_inode_data;
+ attr.val_ptr = fcb;
+ osd_req_add_set_attr_list(or, &attr, 1);
+
+ if (!obj_created(oi)) {
+ EXOFS_DBGMSG("!obj_created\n");
+ BUG_ON(!obj_2bcreated(oi));
+ wait_event(oi->i_wq, obj_created(oi));
+ EXOFS_DBGMSG("wait_event done\n");
+ }
+
+ if (do_sync) {
+ ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+ osd_end_request(or);
+ goto free_args;
+ } else {
+ args->sbi = sbi;
+
+ ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+ if (ret) {
+ osd_end_request(or);
+ goto free_args;
+ }
+ atomic_inc(&sbi->s_curr_pending);
+ goto out; /* deallocation in updatei_done */
+ }
+
+free_args:
+ kfree(args);
+out:
+ EXOFS_DBGMSG("ret=>%d\n", ret);
+ return ret;
+}
+
+int exofs_write_inode(struct inode *inode, int wait)
+{
+ return exofs_update_inode(inode, wait);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct osd_request *or, void *p)
+{
+ struct exofs_sb_info *sbi;
+ osd_end_request(or);
+ sbi = p;
+ atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero. We remove the object
+ * from the OSD here. We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_delete_inode(struct inode *inode)
+{
+ struct exofs_i_info *oi = exofs_i(inode);
+ struct super_block *sb = inode->i_sb;
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+ struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+ struct osd_request *or;
+ int ret;
+
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (is_bad_inode(inode))
+ goto no_delete;
+
+ mark_inode_dirty(inode);
+ exofs_update_inode(inode, inode_needs_sync(inode));
+
+ inode->i_size = 0;
+ if (inode->i_blocks)
+ exofs_truncate(inode);
+
+ clear_inode(inode);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+ return;
+ }
+
+ osd_req_remove_object(or, &obj);
+
+ /* if we are deleting an obj that hasn't been created yet, wait */
+ if (!obj_created(oi)) {
+ BUG_ON(!obj_2bcreated(oi));
+ wait_event(oi->i_wq, obj_created(oi));
+ }
+
+ ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+ if (ret) {
+ EXOFS_ERR(
+ "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+ osd_end_request(or);
+ return;
+ }
+ atomic_inc(&sbi->s_curr_pending);
+
+ return;
+
+no_delete:
+ clear_inode(inode);
+}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 00000000000..77fdd765e76
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+ int err = exofs_add_link(dentry, inode);
+ if (!err) {
+ d_instantiate(dentry, inode);
+ return 0;
+ }
+ inode_dec_link_count(inode);
+ iput(inode);
+ return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+ ino_t ino;
+
+ if (dentry->d_name.len > EXOFS_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ ino = exofs_inode_by_name(dir, dentry);
+ inode = NULL;
+ if (ino) {
+ inode = exofs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+ return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct inode *inode = exofs_new_inode(dir, mode);
+ int err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &exofs_file_inode_operations;
+ inode->i_fop = &exofs_file_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+ mark_inode_dirty(inode);
+ err = exofs_add_nondir(dentry, inode);
+ }
+ return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+ dev_t rdev)
+{
+ struct inode *inode;
+ int err;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ inode = exofs_new_inode(dir, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, inode->i_mode, rdev);
+ mark_inode_dirty(inode);
+ err = exofs_add_nondir(dentry, inode);
+ }
+ return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct super_block *sb = dir->i_sb;
+ int err = -ENAMETOOLONG;
+ unsigned l = strlen(symname)+1;
+ struct inode *inode;
+ struct exofs_i_info *oi;
+
+ if (l > sb->s_blocksize)
+ goto out;
+
+ inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out;
+
+ oi = exofs_i(inode);
+ if (l > sizeof(oi->i_data)) {
+ /* slow symlink */
+ inode->i_op = &exofs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+ memset(oi->i_data, 0, sizeof(oi->i_data));
+
+ err = page_symlink(inode, symname, l);
+ if (err)
+ goto out_fail;
+ } else {
+ /* fast symlink */
+ inode->i_op = &exofs_fast_symlink_inode_operations;
+ memcpy(oi->i_data, symname, l);
+ inode->i_size = l-1;
+ }
+ mark_inode_dirty(inode);
+
+ err = exofs_add_nondir(dentry, inode);
+out:
+ return err;
+
+out_fail:
+ inode_dec_link_count(inode);
+ iput(inode);
+ goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+
+ if (inode->i_nlink >= EXOFS_LINK_MAX)
+ return -EMLINK;
+
+ inode->i_ctime = CURRENT_TIME;
+ inode_inc_link_count(inode);
+ atomic_inc(&inode->i_count);
+
+ return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct inode *inode;
+ int err = -EMLINK;
+
+ if (dir->i_nlink >= EXOFS_LINK_MAX)
+ goto out;
+
+ inode_inc_link_count(dir);
+
+ inode = exofs_new_inode(dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_dir;
+
+ inode->i_op = &exofs_dir_inode_operations;
+ inode->i_fop = &exofs_dir_operations;
+ inode->i_mapping->a_ops = &exofs_aops;
+
+ inode_inc_link_count(inode);
+
+ err = exofs_make_empty(inode, dir);
+ if (err)
+ goto out_fail;
+
+ err = exofs_add_link(dentry, inode);
+ if (err)
+ goto out_fail;
+
+ d_instantiate(dentry, inode);
+out:
+ return err;
+
+out_fail:
+ inode_dec_link_count(inode);
+ inode_dec_link_count(inode);
+ iput(inode);
+out_dir:
+ inode_dec_link_count(dir);
+ goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct exofs_dir_entry *de;
+ struct page *page;
+ int err = -ENOENT;
+
+ de = exofs_find_entry(dir, dentry, &page);
+ if (!de)
+ goto out;
+
+ err = exofs_delete_entry(de, page);
+ if (err)
+ goto out;
+
+ inode->i_ctime = dir->i_ctime;
+ inode_dec_link_count(inode);
+ err = 0;
+out:
+ return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ int err = -ENOTEMPTY;
+
+ if (exofs_empty_dir(inode)) {
+ err = exofs_unlink(dir, dentry);
+ if (!err) {
+ inode->i_size = 0;
+ inode_dec_link_count(inode);
+ inode_dec_link_count(dir);
+ }
+ }
+ return err;
+}
+
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct page *dir_page = NULL;
+ struct exofs_dir_entry *dir_de = NULL;
+ struct page *old_page;
+ struct exofs_dir_entry *old_de;
+ int err = -ENOENT;
+
+ old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+ if (!old_de)
+ goto out;
+
+ if (S_ISDIR(old_inode->i_mode)) {
+ err = -EIO;
+ dir_de = exofs_dotdot(old_inode, &dir_page);
+ if (!dir_de)
+ goto out_old;
+ }
+
+ if (new_inode) {
+ struct page *new_page;
+ struct exofs_dir_entry *new_de;
+
+ err = -ENOTEMPTY;
+ if (dir_de && !exofs_empty_dir(new_inode))
+ goto out_dir;
+
+ err = -ENOENT;
+ new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+ if (!new_de)
+ goto out_dir;
+ inode_inc_link_count(old_inode);
+ err = exofs_set_link(new_dir, new_de, new_page, old_inode);
+ new_inode->i_ctime = CURRENT_TIME;
+ if (dir_de)
+ drop_nlink(new_inode);
+ inode_dec_link_count(new_inode);
+ if (err)
+ goto out_dir;
+ } else {
+ if (dir_de) {
+ err = -EMLINK;
+ if (new_dir->i_nlink >= EXOFS_LINK_MAX)
+ goto out_dir;
+ }
+ inode_inc_link_count(old_inode);
+ err = exofs_add_link(new_dentry, old_inode);
+ if (err) {
+ inode_dec_link_count(old_inode);
+ goto out_dir;
+ }
+ if (dir_de)
+ inode_inc_link_count(new_dir);
+ }
+
+ old_inode->i_ctime = CURRENT_TIME;
+
+ exofs_delete_entry(old_de, old_page);
+ inode_dec_link_count(old_inode);
+
+ if (dir_de) {
+ err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+ inode_dec_link_count(old_dir);
+ if (err)
+ goto out_dir;
+ }
+ return 0;
+
+
+out_dir:
+ if (dir_de) {
+ kunmap(dir_page);
+ page_cache_release(dir_page);
+ }
+out_old:
+ kunmap(old_page);
+ page_cache_release(old_page);
+out:
+ return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+ .create = exofs_create,
+ .lookup = exofs_lookup,
+ .link = exofs_link,
+ .unlink = exofs_unlink,
+ .symlink = exofs_symlink,
+ .mkdir = exofs_mkdir,
+ .rmdir = exofs_rmdir,
+ .mknod = exofs_mknod,
+ .rename = exofs_rename,
+ .setattr = exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+ .setattr = exofs_setattr,
+};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 00000000000..b249ae97fb1
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <scsi/scsi_device.h>
+#include <scsi/osd_sense.h>
+
+#include "exofs.h"
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
+{
+ struct osd_sense_info osi;
+ int ret = osd_req_decode_sense(or, &osi);
+
+ if (ret) { /* translate to Linux codes */
+ if (osi.additional_code == scsi_invalid_field_in_cdb) {
+ if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
+ ret = -EFAULT;
+ if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
+ ret = -ENOENT;
+ else
+ ret = -EINVAL;
+ } else if (osi.additional_code == osd_quota_error)
+ ret = -ENOSPC;
+ else
+ ret = -EIO;
+ }
+
+ /* FIXME: should be include in osd_sense_info */
+ if (in_resid)
+ *in_resid = or->in.req ? or->in.req->data_len : 0;
+
+ if (out_resid)
+ *out_resid = or->out.req ? or->out.req->data_len : 0;
+
+ return ret;
+}
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+ osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+ int ret;
+
+ or->timeout = timeout;
+ ret = osd_finalize_request(or, 0, credential, NULL);
+ if (ret) {
+ EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+ return ret;
+ }
+
+ ret = osd_execute_request(or);
+
+ if (ret)
+ EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+ /* osd_req_decode_sense(or, ret); */
+ return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+ void *caller_context, u8 *cred)
+{
+ int ret;
+
+ ret = osd_finalize_request(or, 0, cred, NULL);
+ if (ret) {
+ EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+ return ret;
+ }
+
+ ret = osd_execute_request_async(or, async_done, caller_context);
+
+ if (ret)
+ EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
+ return ret;
+}
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+ struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+ void *iter = NULL;
+ int nelem;
+
+ do {
+ nelem = 1;
+ osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+ if ((cur_attr.attr_page == attr->attr_page) &&
+ (cur_attr.attr_id == attr->attr_id)) {
+ attr->len = cur_attr.len;
+ attr->val_ptr = cur_attr.val_ptr;
+ return 0;
+ }
+ } while (iter);
+
+ return -EIO;
+}
+
+int osd_req_read_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+ struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+ struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+ if (!bio)
+ return -ENOMEM;
+
+ osd_req_read(or, obj, bio, offset);
+ return 0;
+}
+
+int osd_req_write_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+ struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+ struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+ if (!bio)
+ return -ENOMEM;
+
+ osd_req_write(or, obj, bio, offset);
+ return 0;
+}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 00000000000..9f1985e857e
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+ const char *dev_name;
+ uint64_t pid;
+ int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+
+/*
+ * Our mount-time options. These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that. 32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+ {Opt_pid, "pid=%u"},
+ {Opt_to, "to=%u"},
+ {Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method. Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ bool s_pid = false;
+
+ EXOFS_DBGMSG("parse_options %s\n", options);
+ /* defaults */
+ memset(opts, 0, sizeof(*opts));
+ opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ char str[32];
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_pid:
+ if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+ return -EINVAL;
+ opts->pid = simple_strtoull(str, NULL, 0);
+ if (opts->pid < EXOFS_MIN_PID) {
+ EXOFS_ERR("Partition ID must be >= %u",
+ EXOFS_MIN_PID);
+ return -EINVAL;
+ }
+ s_pid = 1;
+ break;
+ case Opt_to:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ if (option <= 0) {
+ EXOFS_ERR("Timout must be > 0");
+ return -EINVAL;
+ }
+ opts->timeout = option * HZ;
+ break;
+ }
+ }
+
+ if (!s_pid) {
+ EXOFS_ERR("Need to specify the following options:\n");
+ EXOFS_ERR(" -o pid=pid_no_to_use\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache. Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+ struct exofs_i_info *oi;
+
+ oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+ if (!oi)
+ return NULL;
+
+ oi->vfs_inode.i_version = 1;
+ return &oi->vfs_inode;
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+ struct exofs_i_info *oi = foo;
+
+ inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+ exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+ sizeof(struct exofs_i_info), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ exofs_init_once);
+ if (exofs_inode_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+ kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
+
+/*
+ * Write the superblock to the OSD
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+ struct exofs_sb_info *sbi;
+ struct exofs_fscb *fscb;
+ struct osd_request *or;
+ struct osd_obj_id obj;
+ int ret;
+
+ fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
+ if (!fscb) {
+ EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
+ return;
+ }
+
+ lock_kernel();
+ sbi = sb->s_fs_info;
+ fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+ fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+ fscb->s_magic = cpu_to_le16(sb->s_magic);
+ fscb->s_newfs = 0;
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+ goto out;
+ }
+
+ obj.partition = sbi->s_pid;
+ obj.id = EXOFS_SUPER_ID;
+ ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+ if (unlikely(ret)) {
+ EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+ goto out;
+ }
+
+ ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+ if (unlikely(ret)) {
+ EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+ goto out;
+ }
+ sb->s_dirt = 0;
+
+out:
+ if (or)
+ osd_end_request(or);
+ unlock_kernel();
+ kfree(fscb);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock. We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+ int num_pend;
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+
+ /* make sure there are no pending commands */
+ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+ num_pend = atomic_read(&sbi->s_curr_pending)) {
+ wait_queue_head_t wq;
+ init_waitqueue_head(&wq);
+ wait_event_timeout(wq,
+ (atomic_read(&sbi->s_curr_pending) == 0),
+ msecs_to_jiffies(100));
+ }
+
+ osduld_put_device(sbi->s_dev);
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct inode *root;
+ struct exofs_mountopt *opts = data;
+ struct exofs_sb_info *sbi; /*extended info */
+ struct exofs_fscb fscb; /*on-disk superblock info */
+ struct osd_request *or = NULL;
+ struct osd_obj_id obj;
+ int ret;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+ sb->s_fs_info = sbi;
+
+ /* use mount options to fill superblock */
+ sbi->s_dev = osduld_path_lookup(opts->dev_name);
+ if (IS_ERR(sbi->s_dev)) {
+ ret = PTR_ERR(sbi->s_dev);
+ sbi->s_dev = NULL;
+ goto free_sbi;
+ }
+
+ sbi->s_pid = opts->pid;
+ sbi->s_timeout = opts->timeout;
+
+ /* fill in some other data by hand */
+ memset(sb->s_id, 0, sizeof(sb->s_id));
+ strcpy(sb->s_id, "exofs");
+ sb->s_blocksize = EXOFS_BLKSIZE;
+ sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ atomic_set(&sbi->s_curr_pending, 0);
+ sb->s_bdev = NULL;
+ sb->s_dev = 0;
+
+ /* read data from on-disk superblock object */
+ obj.partition = sbi->s_pid;
+ obj.id = EXOFS_SUPER_ID;
+ exofs_make_credential(sbi->s_cred, &obj);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ if (!silent)
+ EXOFS_ERR(
+ "exofs_fill_super: osd_start_request failed.\n");
+ ret = -ENOMEM;
+ goto free_sbi;
+ }
+ ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
+ if (unlikely(ret)) {
+ if (!silent)
+ EXOFS_ERR(
+ "exofs_fill_super: osd_req_read_kern failed.\n");
+ ret = -ENOMEM;
+ goto free_sbi;
+ }
+
+ ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+ if (unlikely(ret)) {
+ if (!silent)
+ EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
+ ret = -EIO;
+ goto free_sbi;
+ }
+
+ sb->s_magic = le16_to_cpu(fscb.s_magic);
+ sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+ sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+ /* make sure what we read from the object store is correct */
+ if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+ if (!silent)
+ EXOFS_ERR("ERROR: Bad magic value\n");
+ ret = -EINVAL;
+ goto free_sbi;
+ }
+
+ /* start generation numbers from a random point */
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+ spin_lock_init(&sbi->s_next_gen_lock);
+
+ /* set up operation vectors */
+ sb->s_op = &exofs_sops;
+ sb->s_export_op = &exofs_export_ops;
+ root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+ if (IS_ERR(root)) {
+ EXOFS_ERR("ERROR: exofs_iget failed\n");
+ ret = PTR_ERR(root);
+ goto free_sbi;
+ }
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root) {
+ iput(root);
+ EXOFS_ERR("ERROR: get root inode failed\n");
+ ret = -ENOMEM;
+ goto free_sbi;
+ }
+
+ if (!S_ISDIR(root->i_mode)) {
+ dput(sb->s_root);
+ sb->s_root = NULL;
+ EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+ root->i_mode);
+ ret = -EINVAL;
+ goto free_sbi;
+ }
+
+ ret = 0;
+out:
+ if (or)
+ osd_end_request(or);
+ return ret;
+
+free_sbi:
+ osduld_put_device(sbi->s_dev); /* NULL safe */
+ kfree(sbi);
+ goto out;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static int exofs_get_sb(struct file_system_type *type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ struct exofs_mountopt opts;
+ int ret;
+
+ ret = parse_options(data, &opts);
+ if (ret)
+ return ret;
+
+ opts.dev_name = dev_name;
+ return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+}
+
+/*
+ * Return information about the file system state in the buffer. This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct exofs_sb_info *sbi = sb->s_fs_info;
+ struct osd_obj_id obj = {sbi->s_pid, 0};
+ struct osd_attr attrs[] = {
+ ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
+ OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
+ ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
+ OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
+ };
+ uint64_t capacity = ULLONG_MAX;
+ uint64_t used = ULLONG_MAX;
+ struct osd_request *or;
+ uint8_t cred_a[OSD_CAP_LEN];
+ int ret;
+
+ /* get used/capacity attributes */
+ exofs_make_credential(cred_a, &obj);
+
+ or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+ if (unlikely(!or)) {
+ EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
+ return -ENOMEM;
+ }
+
+ osd_req_get_attributes(or, &obj);
+ osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+ ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+ if (unlikely(ret))
+ goto out;
+
+ ret = extract_attr_from_req(or, &attrs[0]);
+ if (likely(!ret))
+ capacity = get_unaligned_be64(attrs[0].val_ptr);
+ else
+ EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
+
+ ret = extract_attr_from_req(or, &attrs[1]);
+ if (likely(!ret))
+ used = get_unaligned_be64(attrs[1].val_ptr);
+ else
+ EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
+
+ /* fill in the stats buffer */
+ buf->f_type = EXOFS_SUPER_MAGIC;
+ buf->f_bsize = EXOFS_BLKSIZE;
+ buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+ buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_files = sbi->s_numfiles;
+ buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+ buf->f_namelen = EXOFS_NAME_LEN;
+
+out:
+ osd_end_request(or);
+ return ret;
+}
+
+static const struct super_operations exofs_sops = {
+ .alloc_inode = exofs_alloc_inode,
+ .destroy_inode = exofs_destroy_inode,
+ .write_inode = exofs_write_inode,
+ .delete_inode = exofs_delete_inode,
+ .put_super = exofs_put_super,
+ .write_super = exofs_write_super,
+ .statfs = exofs_statfs,
+};
+
+/******************************************************************************
+ * EXPORT OPERATIONS
+ *****************************************************************************/
+
+struct dentry *exofs_get_parent(struct dentry *child)
+{
+ unsigned long ino = exofs_parent_ino(child);
+
+ if (!ino)
+ return NULL;
+
+ return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
+}
+
+static struct inode *exofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ struct inode *inode;
+
+ inode = exofs_iget(sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (generation && inode->i_generation != generation) {
+ /* we didn't find the right inode.. */
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ return inode;
+}
+
+static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ exofs_nfs_get_inode);
+}
+
+static struct dentry *exofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ exofs_nfs_get_inode);
+}
+
+static const struct export_operations exofs_export_ops = {
+ .fh_to_dentry = exofs_fh_to_dentry,
+ .fh_to_parent = exofs_fh_to_parent,
+ .get_parent = exofs_get_parent,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+ .owner = THIS_MODULE,
+ .name = "exofs",
+ .get_sb = exofs_get_sb,
+ .kill_sb = generic_shutdown_super,
+};
+
+static int __init init_exofs(void)
+{
+ int err;
+
+ err = init_inodecache();
+ if (err)
+ goto out;
+
+ err = register_filesystem(&exofs_type);
+ if (err)
+ goto out_d;
+
+ return 0;
+out_d:
+ destroy_inodecache();
+out:
+ return err;
+}
+
+static void __exit exit_exofs(void)
+{
+ unregister_filesystem(&exofs_type);
+ destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 00000000000..36e2d7bc7f7
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+ nd_set_link(nd, (char *)oi->i_data);
+ return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = exofs_follow_link,
+};
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b2..d46e38cb85c 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
return PTR_ERR(acl);
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
struct posix_acl *clone;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880..d81ef2fdb08 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
return PTR_ERR(acl);
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
struct posix_acl *clone;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 5853f4440af..3d724a95882 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -42,7 +42,7 @@ const struct file_operations ext3_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ext3_readdir, /* we take BKL. needed?*/
- .ioctl = ext3_ioctl, /* BKL held */
+ .unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
#endif
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 3be1e0689c9..521f8238b2f 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -112,7 +112,7 @@ const struct file_operations ext3_file_operations = {
.write = do_sync_write,
.aio_read = generic_file_aio_read,
.aio_write = ext3_file_write,
- .ioctl = ext3_ioctl,
+ .unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
#endif
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4a09ff16987..d3ef6566b01 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+ int ret;
handle_t *handle;
int retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
+ /* Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason */
+ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1184,15 +1187,20 @@ retry:
}
write_begin_failed:
if (ret) {
- ext3_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before truncate
+ * finishes.
*/
if (pos + len > inode->i_size)
+ ext3_orphan_add(handle, inode);
+ ext3_journal_stop(handle);
+ unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
return err;
}
+/* For ordered writepage and write_end functions */
+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * Write could have mapped the buffer but it didn't copy the data in
+ * yet. So avoid filing such buffer into a transaction.
+ */
+ if (buffer_mapped(bh) && buffer_uptodate(bh))
+ return ext3_journal_dirty_data(handle, bh);
+ return 0;
+}
+
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext3 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
- * after block_write_end.
+ * This is nasty and subtle: ext3_write_begin() could have allocated blocks
+ * for the whole page but later we failed to copy the data in. Update inode
+ * size according to what we managed to copy. The rest is going to be
+ * truncated in write_end function.
*/
-static int ext3_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
+ /* What matters to us is i_disksize. We don't write i_size anywhere */
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ if (pos + copied > EXT3_I(inode)->i_disksize) {
+ EXT3_I(inode)->i_disksize = pos + copied;
mark_inode_dirty(inode);
}
-
- return copied;
}
/*
@@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
unsigned from, to;
int ret = 0, ret2;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + copied;
ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext3_journal_dirty_data);
+ from, to, NULL, journal_dirty_data_fn);
- if (ret == 0) {
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
- ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
- }
+ if (ret == 0)
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size)
+ ext3_orphan_add(handle, inode);
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}
@@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
{
handle_t *handle = ext3_journal_current_handle();
struct inode *inode = file->f_mapping->host;
- int ret = 0, ret2;
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT3_I(inode)->i_disksize)
- EXT3_I(inode)->i_disksize = new_i_size;
-
- ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
- page, fsdata);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
+ int ret;
- ret2 = ext3_journal_stop(handle);
- if (!ret)
- ret = ret2;
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ update_file_sizes(inode, pos, copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size)
+ ext3_orphan_add(handle, inode);
+ ret = ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}
@@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ page_zero_new_buffers(page, from + copied, to);
+ to = from + copied;
}
ret = walk_page_buffers(handle, page_buffers(page), from,
to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
- if (pos+copied > inode->i_size)
- i_size_write(inode, pos+copied);
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+ /*
+ * There may be allocated blocks outside of i_size because
+ * we failed to copy some data. Prepare for truncate.
+ */
+ if (pos + len > inode->i_size)
+ ext3_orphan_add(handle, inode);
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
if (inode->i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}
@@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext3_journal_dirty_data(handle, bh);
- return 0;
-}
-
static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
{
return !buffer_mapped(bh);
}
+
/*
* Note that we always start a transaction even if we're not journalling
* data. This is to preserve ordering: any hole instantiation within
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 5e86ce9a86e..88974814783 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -15,12 +15,11 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/compat.h>
-#include <linux/smp_lock.h>
#include <asm/uaccess.h>
-int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
- unsigned long arg)
+long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = filp->f_dentry->d_inode;
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
@@ -39,29 +38,25 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *) arg))
+ return -EFAULT;
+
err = mnt_want_write(filp->f_path.mnt);
if (err)
return err;
- if (!is_owner_or_cap(inode)) {
- err = -EACCES;
- goto flags_out;
- }
-
- if (get_user(flags, (int __user *) arg)) {
- err = -EFAULT;
- goto flags_out;
- }
-
flags = ext3_mask_flags(inode->i_mode, flags);
mutex_lock(&inode->i_mutex);
+
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
- err = -EPERM;
+ err = -EPERM;
+ if (IS_NOQUOTA(inode))
goto flags_out;
- }
+
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
@@ -74,11 +69,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- err = -EPERM;
+ if (!capable(CAP_LINUX_IMMUTABLE))
goto flags_out;
- }
}
/*
@@ -86,17 +78,12 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE)) {
- mutex_unlock(&inode->i_mutex);
- err = -EPERM;
+ if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
- }
}
-
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
- mutex_unlock(&inode->i_mutex);
err = PTR_ERR(handle);
goto flags_out;
}
@@ -116,15 +103,13 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext3_journal_stop(handle);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- return err;
- }
+ if (err)
+ goto flags_out;
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
- mutex_unlock(&inode->i_mutex);
flags_out:
+ mutex_unlock(&inode->i_mutex);
mnt_drop_write(filp->f_path.mnt);
return err;
}
@@ -140,6 +125,7 @@ flags_out:
if (!is_owner_or_cap(inode))
return -EPERM;
+
err = mnt_want_write(filp->f_path.mnt);
if (err)
return err;
@@ -147,6 +133,7 @@ flags_out:
err = -EFAULT;
goto setversion_out;
}
+
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -299,9 +286,6 @@ group_add_out:
#ifdef CONFIG_COMPAT
long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- int ret;
-
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
case EXT3_IOC32_GETFLAGS:
@@ -341,9 +325,6 @@ long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
default:
return -ENOIOCTLCMD;
}
- lock_kernel();
- ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
- unlock_kernel();
- return ret;
+ return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e2fc63cbba8..6ddaa0a42b2 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(struct qstr *entry,
struct dx_frame *frame,
int *err);
static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
static int ext3_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
@@ -708,14 +708,14 @@ errout:
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
{
int count = 0;
char *base = (char *) de;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + size)
+ while ((char *) de < base + blocksize)
{
if (de->name_len && de->inode) {
ext3fs_dirhash(de->name, de->name_len, &h);
@@ -1047,8 +1047,16 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
return ERR_PTR(-EIO);
}
inode = ext3_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (unlikely(IS_ERR(inode))) {
+ if (PTR_ERR(inode) == -ESTALE) {
+ ext3_error(dir->i_sb, __func__,
+ "deleted inode referenced: %lu",
+ ino);
+ return ERR_PTR(-EIO);
+ } else {
+ return ERR_CAST(inode);
+ }
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1120,13 +1128,14 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
{
- struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
+ struct ext3_dir_entry_2 *next, *to, *prev;
+ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
unsigned rec_len = 0;
prev = to = de;
- while ((char*)de < base + size) {
+ while ((char *)de < base + blocksize) {
next = ext3_next_entry(de);
if (de->inode && de->name_len) {
rec_len = EXT3_DIR_REC_LEN(de->name_len);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc..647e0d65a28 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
return PTR_ERR(acl);
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
struct posix_acl *clone;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
}
static int ext4_group_used_meta_blocks(struct super_block *sb,
- ext4_group_t block_group)
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
ext4_fsblk_t tmp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
int used_blocks = sbi->s_itb_per_group + 2;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
- struct ext4_group_desc *gdp;
- struct buffer_head *bh;
-
- gdp = ext4_get_group_desc(sb, block_group, &bh);
if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
block_group))
used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
}
- return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+ return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
}
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_add(blocks_freed,
+ &sbi->s_flex_groups[flex_group].free_blocks);
}
/*
* request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
unsigned int offset)
{
const char *error_msg = NULL;
- const int rlen = ext4_rec_len_from_disk(de->rec_len);
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (rlen < EXT4_DIR_REC_LEN(1))
error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len)
- < EXT4_DIR_REC_LEN(1))
+ if (ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
break;
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = i;
filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
ret = stored;
goto out;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
@@ -225,7 +228,8 @@ revalidate:
goto revalidate;
stored++;
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+ filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = 0;
brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057..d0f15ef56de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
#undef EXT4FS_DEBUG
/*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS 8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS 1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
* Debug code
*/
#ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
-#define EXT4_MULTIBLOCK_ALLOCATOR 1
-
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
*/
struct flex_groups {
- __u32 free_inodes;
- __u32 free_blocks;
+ atomic_t free_inodes;
+ atomic_t free_blocks;
+ atomic_t used_dirs;
};
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+ EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+ EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT4_REG_FLMASK;
+ else
+ return flags & EXT4_OTHER_FLMASK;
+}
+
/*
* Inode dynamic state flags
*/
@@ -256,6 +271,7 @@ struct flex_groups {
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
/*
* ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_reserved_char_pad2;
__le16 s_reserved_pad;
- __u32 s_reserved[162]; /* Padding to the end of the block */
+ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
+ __u32 s_reserved[160]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
- unsigned len = le16_to_cpu(dlen);
-
- if (len == EXT4_MAX_REC_LEN || len == 0)
- return 1 << 16;
- return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
- if (len == (1 << 16))
- return cpu_to_le16(EXT4_MAX_REC_LEN);
- else if (len > (1 << 16))
- BUG();
- return cpu_to_le16(len);
-}
-
/*
* Hash Tree Directory indexing
* (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
extern struct proc_dir_entry *ext4_proc_root;
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define EXT4_PROC_HANDLER(name, var) \
-do { \
- proc = proc_create_data(name, mode, sbi->s_proc, \
- &ext4_ui_proc_fops, &sbi->s_##var); \
- if (proc == NULL) { \
- printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
- goto err_out; \
- } \
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
/*
* Function prototypes
*/
@@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
/* ioctl.c */
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+
/* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
ext4_lblk_t *, ext4_fsblk_t *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
/*
* storage for cached extent
*/
@@ -125,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyblocks_counter;
- struct blockgroup_lock s_blockgroup_lock;
+ struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
-
- /* root of the per fs reservation window tree */
- spinlock_t s_rsv_window_lock;
- struct rb_root s_rsv_window_root;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
/* Journaling */
struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
static inline spinlock_t *
sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
{
- return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
}
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;
if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}
/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
return max;
}
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+ ext4_fsblk_t block = ext_pblock(ext);
+ int len = ext4_ext_get_actual_len(ext);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ ((block + len) > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+ struct ext4_extent_idx *ext_idx)
+{
+ ext4_fsblk_t block = idx_pblock(ext_idx);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ (block > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+ struct ext4_extent_header *eh,
+ int depth)
+{
+ struct ext4_extent *ext;
+ struct ext4_extent_idx *ext_idx;
+ unsigned short entries;
+ if (eh->eh_entries == 0)
+ return 1;
+
+ entries = le16_to_cpu(eh->eh_entries);
+
+ if (depth == 0) {
+ /* leaf entries */
+ ext = EXT_FIRST_EXTENT(eh);
+ while (entries) {
+ if (!ext4_valid_extent(inode, ext))
+ return 0;
+ ext++;
+ entries--;
+ }
+ } else {
+ ext_idx = EXT_FIRST_INDEX(eh);
+ while (entries) {
+ if (!ext4_valid_extent_idx(inode, ext_idx))
+ return 0;
+ ext_idx++;
+ entries--;
+ }
+ }
+ return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
struct ext4_extent_header *eh,
int depth)
{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ error_msg = "invalid extent entries";
+ goto corrupted;
+ }
return 0;
corrupted:
ext4_error(inode->i_sb, function,
- "bad header in inode #%lu: %s - magic %x, "
+ "bad header/extent in inode #%lu: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
return -EIO;
}
-#define ext4_ext_check_header(inode, eh, depth) \
- __ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth) \
+ __ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
- if (ext4_ext_check_header(inode, eh, depth))
- return ERR_PTR(-EIO);
-
/* account possible depth increase */
if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
i = depth;
/* walk through the tree */
while (i) {
+ int need_to_validate = 0;
+
ext_debug("depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_bread(inode->i_sb, path[ppos].p_block);
- if (!bh)
+ bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+ if (unlikely(!bh))
goto err;
-
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto err;
+ }
+ /* validate the extent entries */
+ need_to_validate = 1;
+ }
eh = ext_block_hdr(bh);
ppos++;
BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_header(inode, eh, i))
+ if (need_to_validate && ext4_ext_check(inode, eh, i))
goto err;
}
@@ -1181,7 +1276,7 @@ got_index:
return -EIO;
eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -1194,7 +1289,7 @@ got_index:
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
return -ENOMEM;
}
path[0].p_hdr = ext_inode_hdr(inode);
- if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
err = -EIO;
break;
}
- if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+ if (ext4_ext_check(inode, ext_block_hdr(bh),
depth - i - 1)) {
err = -EIO;
break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
+ if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ ext4_alloc_da_blocks(inode);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
- ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+ }
+
}
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_dec(&sbi->s_dirs_counter);
if (sbi->s_log_groups_per_flex) {
- flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_inodes++;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
}
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
sbi->s_log_groups_per_flex;
find_close_to_parent:
- flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
- if (flex_group[best_flex].free_inodes &&
+ if (atomic_read(&flex_group[best_flex].free_inodes) &&
flex_freeb_ratio > free_block_ratio)
goto found_flexbg;
@@ -375,24 +381,24 @@ find_close_to_parent:
if (i == parent_fbg_group || i == parent_fbg_group - 1)
continue;
- flexbg_free_blocks = flex_group[i].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
if (flex_freeb_ratio > free_block_ratio &&
- flex_group[i].free_inodes) {
+ (atomic_read(&flex_group[i].free_inodes))) {
best_flex = i;
goto found_flexbg;
}
- if (flex_group[best_flex].free_inodes == 0 ||
- (flex_group[i].free_blocks >
- flex_group[best_flex].free_blocks &&
- flex_group[i].free_inodes))
+ if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
+ ((atomic_read(&flex_group[i].free_blocks) >
+ atomic_read(&flex_group[best_flex].free_blocks)) &&
+ atomic_read(&flex_group[i].free_inodes)))
best_flex = i;
}
- if (!flex_group[best_flex].free_inodes ||
- !flex_group[best_flex].free_blocks)
+ if (!atomic_read(&flex_group[best_flex].free_inodes) ||
+ !atomic_read(&flex_group[best_flex].free_blocks))
return -1;
found_flexbg:
@@ -410,6 +416,42 @@ out:
return 0;
}
+struct orlov_stats {
+ __u32 free_inodes;
+ __u32 free_blocks;
+ __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg. If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
+{
+ struct ext4_group_desc *desc;
+ struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+
+ if (flex_size > 1) {
+ stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+ stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+ stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ return;
+ }
+
+ desc = ext4_get_group_desc(sb, g, NULL);
+ if (desc) {
+ stats->free_inodes = ext4_free_inodes_count(sb, desc);
+ stats->free_blocks = ext4_free_blks_count(sb, desc);
+ stats->used_dirs = ext4_used_dirs_count(sb, desc);
+ } else {
+ stats->free_inodes = 0;
+ stats->free_blocks = 0;
+ stats->used_dirs = 0;
+ }
+}
+
/*
* Orlov's allocator for directories.
*
@@ -425,35 +467,34 @@ out:
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+
+ if (flex_size > 1) {
+ ngroups = (ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;
get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (ext4_free_blks_count(sb, desc) < avefreeb)
+ if (stats.free_blocks < avefreeb)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = ext4_used_dirs_count(sb, desc);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= sbi->s_groups_count)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
}
- if (ret == 0)
- return ret;
goto fallback;
}
- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (ext4_free_inodes_count(sb, desc) < min_inodes)
+ if (stats.free_inodes < min_inodes)
continue;
- if (ext4_free_blks_count(sb, desc) < min_blocks)
+ if (stats.free_blocks < min_blocks)
continue;
- return 0;
+ goto found_flex_bg;
}
fallback:
+ ngroups = sbi->s_groups_count;
+ avefreei = freei / ngroups;
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei)
+ ext4_free_inodes_count(sb, desc) >= avefreei) {
+ *group = grp;
return 0;
+ }
}
if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i;
+ ext4_group_t i, last;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode is the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode);
+ }
/*
* Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
if (S_ISDIR(mode)) {
count = ext4_used_dirs_count(sb, gdp) + 1;
ext4_used_dirs_set(sb, gdp, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ }
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
- if (sbi->s_log_groups_per_flex) {
+ if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0 && once)
once = 0;
printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group);
+ ret2 = find_group_orlov(sb, dir, &group, mode);
} else
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
got_group:
+ EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -858,9 +970,7 @@ got:
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_inodes--;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
ei->i_disksize = 0;
/*
- * Don't inherit extent flag from directory. We set extent flag on
- * newly created directory and file only if -o extent mount option is
- * specified
+ * Don't inherit extent flag from directory, amongst others. We set
+ * extent flag on newly created directory and file only if -o extent
+ * mount option is specified
*/
- ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
- /* dirsync only applies to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT4_DIRSYNC_FL;
+ ei->i_flags =
+ ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db7..a2e7952bc5f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
return n;
}
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+ unsigned int *p, unsigned int max) {
+
+ unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+ unsigned int *bref = p;
+ while (bref < p+max) {
+ if (unlikely(*bref >= maxblocks)) {
+ ext4_error(inode->i_sb, function,
+ "block reference %u >= max (%u) "
+ "in inode #%lu, offset=%d",
+ *bref, maxblocks,
+ inode->i_ino, (int)(bref-p));
+ return -EIO;
+ }
+ bref++;
+ }
+ return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh) \
+ __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode) \
+ __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
+
/**
* ext4_get_branch - read the chain of indirect blocks leading to data
* @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
+ bh = sb_getblk(sb, le32_to_cpu(p->key));
+ if (unlikely(!bh))
goto failure;
+
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto failure;
+ }
+ /* validate block references */
+ if (ext4_check_indirect_blockref(inode, bh)) {
+ put_bh(bh);
+ goto failure;
+ }
+ }
+
add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
/* Reader: end */
if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
/* Try to find previous block */
for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* It is going to be referred to from the inode itself? OK, just put it
* into the same cylinder group then.
*/
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
/*
* free those over-booking quota for metadata blocks
*/
-
if (mdb_free)
vfs_dq_release_reservation_block(inode, mdb_free);
+
+ /*
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
+ */
+ if (!total && (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
}
/*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
struct mpage_da_data {
struct inode *inode;
- struct buffer_head lbh; /* extent of blocks */
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
unsigned long first_page, next_page; /* extent of pages */
- get_block_t *get_block;
struct writeback_control *wbc;
int io_done;
int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
* @mpd->inode: inode
* @mpd->first_page: first page of the extent
* @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
*
* By the time mpage_da_submit_io() is called we expect all blocks
* to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
/*
* We need to start from the first_page to the next_page - 1
* to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
+ * If we look at mpd->b_blocknr we would only be looking
* at the currently mapped buffer_heads.
*/
index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ if (ret <= 0)
+ return ret;
+
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ if (ext4_should_order_data(inode)) {
+ int retval;
+ retval = ext4_jbd2_file_inode(handle, inode);
+ if (retval)
+ /*
+ * Failed to add inode for ordered mode. Don't
+ * update file size
+ */
+ return retval;
+ }
+
+ /*
+ * Update on-disk size along with block allocation we don't
+ * use 'extend_disksize' as size may change within already
+ * allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, disksize);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ return 0;
+}
+
/*
* mpage_da_map_blocks - go through given space
*
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
*
* The function skips space we know is already mapped to disk blocks.
*
*/
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err = 0;
struct buffer_head new;
- struct buffer_head *lbh = &mpd->lbh;
sector_t next;
/*
* We consider only non-mapped and non-allocated blocks
*/
- if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ if ((mpd->b_state & (1 << BH_Mapped)) &&
+ !(mpd->b_state & (1 << BH_Delay)))
return 0;
- new.b_state = lbh->b_state;
+ new.b_state = mpd->b_state;
new.b_blocknr = 0;
- new.b_size = lbh->b_size;
- next = lbh->b_blocknr;
+ new.b_size = mpd->b_size;
+ next = mpd->b_blocknr;
/*
* If we didn't accumulate anything
* to write simply return
*/
if (!new.b_size)
return 0;
- err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
- /* If get block returns with error
- * we simply return. Later writepage
- * will redirty the page and writepages
- * will find the dirty page again
+ err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * If get block returns with error we simply
+ * return. Later writepage will redirty the page and
+ * writepages will find the dirty page again
*/
if (err == -EAGAIN)
return 0;
if (err == -ENOSPC &&
- ext4_count_free_blocks(mpd->inode->i_sb)) {
+ ext4_count_free_blocks(mpd->inode->i_sb)) {
mpd->retval = err;
return 0;
}
/*
- * get block failure will cause us
- * to loop in writepages. Because
- * a_ops->writepage won't be able to
- * make progress. The page will be redirtied
- * by writepage and writepages will again
- * try to write the same.
+ * get block failure will cause us to loop in
+ * writepages, because a_ops->writepage won't be able
+ * to make progress. The page will be redirtied by
+ * writepage and writepages will again try to write
+ * the same.
*/
printk(KERN_EMERG "%s block allocation failed for inode %lu "
"at logical offset %llu with max blocks "
"%zd with error %d\n",
__func__, mpd->inode->i_ino,
(unsigned long long)next,
- lbh->b_size >> mpd->inode->i_blkbits, err);
+ mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
}
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
- lbh->b_size >> mpd->inode->i_blkbits);
+ mpd->b_size >> mpd->inode->i_blkbits);
return err;
}
BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
- if (buffer_delay(lbh) || buffer_unwritten(lbh))
+ if ((mpd->b_state & (1 << BH_Delay)) ||
+ (mpd->b_state & (1 << BH_Unwritten)))
mpage_put_bnr_to_bhs(mpd, next, &new);
return 0;
@@ -2014,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* the function is used to collect contig. blocks in same state
*/
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, struct buffer_head *bh)
+ sector_t logical, size_t b_size,
+ unsigned long b_state)
{
sector_t next;
- size_t b_size = bh->b_size;
- struct buffer_head *lbh = &mpd->lbh;
- int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
/* check if thereserved journal credits might overflow */
if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
/*
* First block in the extent
*/
- if (lbh->b_size == 0) {
- lbh->b_blocknr = logical;
- lbh->b_size = b_size;
- lbh->b_state = bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0) {
+ mpd->b_blocknr = logical;
+ mpd->b_size = b_size;
+ mpd->b_state = b_state & BH_FLAGS;
return;
}
- next = lbh->b_blocknr + nrblocks;
+ next = mpd->b_blocknr + nrblocks;
/*
* Can we merge the block to our big extent?
*/
- if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
- lbh->b_size += b_size;
+ if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+ mpd->b_size += b_size;
return;
}
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
{
struct mpage_da_data *mpd = data;
struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head, fake;
+ struct buffer_head *bh, *head;
sector_t logical;
if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
/*
* ... and blocks
*/
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
+ mpd->b_size = 0;
+ mpd->b_state = 0;
+ mpd->b_blocknr = 0;
}
mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!page_has_buffers(page)) {
- /*
- * There is no attached buffer heads yet (mmap?)
- * we treat the page asfull of dirty blocks
- */
- bh = &fake;
- bh->b_size = PAGE_CACHE_SIZE;
- bh->b_state = 0;
- set_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- mpage_add_bh_to_extent(mpd, logical, bh);
+ mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
* with the page in ext4_da_writepage
*/
if (buffer_dirty(bh) &&
- (!buffer_mapped(bh) || buffer_delay(bh))) {
- mpage_add_bh_to_extent(mpd, logical, bh);
+ (!buffer_mapped(bh) || buffer_delay(bh))) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_size,
+ bh->b_state);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
* unmapped buffer_head later we need to
* use the b_state flag of that buffer_head.
*/
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0)
+ mpd->b_state = bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
}
/*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd)
-{
- int ret;
-
- if (!mpd->get_block)
- return generic_writepages(mapping, wbc);
-
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
- mpd->first_page = 0;
- mpd->next_page = 0;
- mpd->io_done = 0;
- mpd->pages_written = 0;
- mpd->retval = 0;
-
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
- /*
- * Handle last extent of pages
- */
- if (!mpd->io_done && mpd->next_page != mpd->first_page) {
- if (mpage_da_map_blocks(mpd) == 0)
- mpage_da_submit_io(mpd);
-
- mpd->io_done = 1;
- ret = MPAGE_DA_EXTENT_TAIL;
- }
- wbc->nr_to_write -= mpd->pages_written;
- return ret;
-}
-
-/*
* this is a special callback for ->write_begin() only
* it's intention is to return mapped block or reserve space
*/
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
}
-#define EXT4_DELALLOC_RSVED 1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- loff_t disksize = EXT4_I(inode)->i_disksize;
- handle_t *handle = NULL;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, create, 0, EXT4_DELALLOC_RSVED);
- if (ret > 0) {
-
- bh_result->b_size = (ret << inode->i_blkbits);
-
- if (ext4_should_order_data(inode)) {
- int retval;
- retval = ext4_jbd2_file_inode(handle, inode);
- if (retval)
- /*
- * Failed to add inode for ordered
- * mode. Don't update file size
- */
- return retval;
- }
-
- /*
- * Update on-disk size along with block allocation
- * we don't use 'extend_disksize' as size may change
- * within already allocated block -bzzz
- */
- disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, disksize);
- ret = ext4_mark_inode_dirty(handle, inode);
- return ret;
- }
- ret = 0;
- }
- return ret;
-}
static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
{
@@ -2569,8 +2578,38 @@ retry:
dump_stack();
goto out_writepages;
}
- mpd.get_block = ext4_da_get_block_write;
- ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+ /*
+ * Now call __mpage_da_writepage to find the next
+ * contiguous region of logical blocks that need
+ * blocks to be allocated by ext4. We don't actually
+ * submit the blocks for I/O here, even though
+ * write_cache_pages thinks it will, and will set the
+ * pages as clean for write before calling
+ * __mpage_da_writepage().
+ */
+ mpd.b_size = 0;
+ mpd.b_state = 0;
+ mpd.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.io_done = 0;
+ mpd.pages_written = 0;
+ mpd.retval = 0;
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+ &mpd);
+ /*
+ * If we have a contigous extent of pages and we
+ * haven't done the I/O yet, map the blocks and submit
+ * them for I/O.
+ */
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+ if (mpage_da_map_blocks(&mpd) == 0)
+ mpage_da_submit_io(&mpd);
+ mpd.io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2846,6 +2885,48 @@ out:
return;
}
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+ if (!EXT4_I(inode)->i_reserved_data_blocks &&
+ !EXT4_I(inode)->i_reserved_meta_blocks)
+ return 0;
+
+ /*
+ * We do something simple for now. The filemap_flush() will
+ * also start triggering a write of the data blocks, which is
+ * not strictly speaking necessary (and for users of
+ * laptop_mode, not even desirable). However, to do otherwise
+ * would require replicating code paths in:
+ *
+ * ext4_da_writepages() ->
+ * write_cache_pages() ---> (via passed in callback function)
+ * __mpage_da_writepage() -->
+ * mpage_add_bh_to_extent()
+ * mpage_da_map_blocks()
+ *
+ * The problem is that write_cache_pages(), located in
+ * mm/page-writeback.c, marks pages clean in preparation for
+ * doing I/O, which is not desirable if we're not planning on
+ * doing I/O at all.
+ *
+ * We could call write_cache_pages(), and then redirty all of
+ * the pages by calling redirty_page_for_writeback() but that
+ * would be ugly in the extreme. So instead we would need to
+ * replicate parts of the code in the above functions,
+ * simplifying them becuase we wouldn't actually intend to
+ * write out the pages, but rather only collect contiguous
+ * logical block extents, call the multi-block allocator, and
+ * then update the buffer heads with the block allocations.
+ *
+ * For now, though, we'll cheat by calling filemap_flush(),
+ * which will map the blocks, and start the I/O, but not
+ * actually wait for the I/O to complete.
+ */
+ return filemap_flush(inode->i_mapping);
+}
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
return;
@@ -4110,12 +4194,7 @@ make_io:
unsigned num;
table = ext4_inode_table(sb, gdp);
- /* Make sure s_inode_readahead_blks is a power of 2 */
- while (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1))
- EXT4_SB(sb)->s_inode_readahead_blks =
- (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1));
+ /* s_inode_readahead_blks is always a power of 2 */
b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
if (table > b)
b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
+ if (ei->i_flags & EXT4_EXTENTS_FL) {
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_check_inode_blockref(inode);
+ }
+ if (ret) {
+ brelse(bh);
+ goto bad_inode;
+ }
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else {
+ brelse(bh);
+ ret = -EIO;
+ ext4_error(inode->i_sb, __func__,
+ "bogus i_mode (%o) for inode=%lu",
+ inode->i_mode, inode->i_ino);
+ goto bad_inode;
}
brelse(iloc.bh);
ext4_set_inode_flags(inode);
@@ -5146,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh);
}
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
loff_t size;
unsigned long len;
int ret = -EINVAL;
@@ -5199,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
goto out_unlock;
ret = 0;
out_unlock:
+ if (ret)
+ ret = VM_FAULT_SIGBUS;
up_read(&inode->i_alloc_sem);
return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247..91e75f7a9e7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (err)
return err;
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT4_DIRSYNC_FL;
+ flags = ext4_mask_flags(inode->i_mode, flags);
err = -EPERM;
mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
return err;
}
+ case EXT4_IOC_ALLOC_DA_BLKS:
+ {
+ int err;
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ err = ext4_alloc_da_blocks(inode);
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
+
default:
return -ENOTTY;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd03..f871677a798 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
* The allocation request involve request for multiple number of blocks
* near to the goal(block) value specified.
*
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
*
* The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
*
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
* 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
* stripe value (sbi->s_stripe)
*
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator(using the buddy cache) supports few tunables.
*
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
*
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
* value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
* stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
* min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
* best extent in the found extents. Searching for the blocks starts with
* the group specified as the goal value in allocation context via
* ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
{
unsigned free, fragments;
unsigned i, bits;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_desc *desc;
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
return 0;
+ /* Avoid using the first bg of a flexgroup for data files */
+ if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+ (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+ ((group % flex_size) == 0))
+ return 0;
+
bits = ac->ac_sb->s_blocksize_bits + 1;
for (i = ac->ac_2order; i <= bits; i++)
if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/*
* We search using buddy data only if the order of the request
* is greater than equal to the sbi_s_mb_order2_reqs
- * You can tune it via /proc/fs/ext4/<partition>/order2_req
+ * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
*/
if (i >= sbi->s_mb_order2_reqs) {
/*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
- kfree(sbi->s_mb_maxs);
+ kfree(sbi->s_mb_offsets);
return -ENOMEM;
}
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
free_percpu(sbi->s_locality_groups);
ext4_mb_history_release(sb);
- ext4_mb_destroy_per_dev_proc(sb);
return 0;
}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
-#define EXT4_MB_STATS_NAME "stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
-#define EXT4_MB_ORDER2_REQ "order2_req"
-#define EXT4_MB_STREAM_REQ "stream_req"
-#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
- mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct proc_dir_entry *proc;
-
- if (sbi->s_proc == NULL)
- return -EINVAL;
-
- EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
- EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
- EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
- EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
- EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
- EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
- return 0;
-
-err_out:
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
- return -ENOMEM;
-#else
- return 0;
-#endif
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_proc == NULL)
- return -EINVAL;
-
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
- return 0;
-}
-
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_blocks);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
* here we normalize request for locality group
* Group request are normalized to s_strip size if we set the same via mount
* option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
*
* XXX: should we try to preallocate more than the group has now?
*/
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
- /* If linear, pa_pstart may be in the next group when pa is used up */
- if (pa->pa_linear)
+ /*
+ * If doing group-based preallocation, pa_pstart may be in the
+ * next group when pa is used up
+ */
+ if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 0;
+ pa->pa_type = MB_INODE_PA;
mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 1;
+ pa->pa_type = MB_GROUP_PA;
mb_debug("new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
- if (pa->pa_linear)
+ if (pa->pa_type == MB_GROUP_PA)
ext4_mb_release_group_pa(&e4b, pa, ac);
else
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
spin_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
- BUG_ON(pa->pa_linear != 0);
+ BUG_ON(pa->pa_type != MB_INODE_PA);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
* file is determined by the current size or the resulting size after
* allocation which ever is larger
*
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
*/
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
continue;
}
/* only lg prealloc space */
- BUG_ON(!pa->pa_linear);
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
- spin_unlock(&pa->pa_lock);
+ spin_unlock(&tmp_pa->pa_lock);
continue;
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
- if (pa->pa_linear) {
+ if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
* doesn't grow big. We need to release
* alloc_semp before calling ext4_mb_add_n_trim()
*/
- if (pa->pa_linear && likely(pa->pa_free)) {
+ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += count;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}
ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf1..dd9e6cd5f6c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
ext4_lblk_t pa_lstart; /* log. block */
unsigned short pa_len; /* len of preallocated chunk */
unsigned short pa_free; /* how many blocks are free */
- unsigned short pa_linear; /* consumed in one direction
- * strictly, for grp prealloc */
+ unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
};
+enum {
+ MB_INODE_PA = 0,
+ MB_GROUP_PA = 1
+};
struct ext4_free_extent {
ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3e..22098e1cd08 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
struct dx_frame *frame,
int *err);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
- struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+ struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+ if (len == EXT4_MAX_REC_LEN || len == 0)
+ return blocksize;
+ return (len & 65532) | ((len & 3) << 16);
+}
+
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+ if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+ BUG();
+ if (len < 65536)
+ return cpu_to_le16(len);
+ if (len == blocksize) {
+ if (blocksize == 65536)
+ return cpu_to_le16(EXT4_MAX_REC_LEN);
+ else
+ return cpu_to_le16(0);
+ }
+ return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
/*
* p is at least 6 bytes before the end of page
*/
static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
+ ext4_rec_len_from_disk(p->rec_len, blocksize));
}
/*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
}
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, size);
}
printk("(%i)\n", names);
return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de)) {
+ for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
goto errout;
count++;
@@ -713,15 +737,15 @@ errout:
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail)
{
int count = 0;
char *base = (char *) de;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + size)
- {
+ while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
ext4fs_dirhash(de->name, de->name_len, &h);
map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
cond_resched();
}
/* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
return 1;
}
/* prevent looping on a bad block */
- de_len = ext4_rec_len_from_disk(de->rec_len);
+ de_len = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (de_len <= 0)
return -1;
offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de)) {
+ for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ ((char *) de - bh->b_data);
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (unlikely(IS_ERR(inode))) {
+ if (PTR_ERR(inode) == -ESTALE) {
+ ext4_error(dir->i_sb, __func__,
+ "deleted inode referenced: %u",
+ ino);
+ return ERR_PTR(-EIO);
+ } else {
+ return ERR_CAST(inode);
+ }
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+ unsigned blocksize)
{
unsigned rec_len = 0;
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
- ext4_rec_len_to_disk(rec_len);
+ ext4_rec_len_to_disk(rec_len, blocksize);
de->inode = 0;
map++;
to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
prev = to = de;
- while ((char*)de < base + size) {
- next = ext4_next_entry(de);
+ while ((char*)de < base + blocksize) {
+ next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
rec_len = EXT4_DIR_REC_LEN(de->name_len);
if (de > to)
memmove(to, de, rec_len);
- to->rec_len = ext4_rec_len_to_disk(rec_len);
+ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
prev = to;
to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
}
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split);
+ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
- de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize);
+ de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+ blocksize);
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned int offset = 0;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
unsigned short reclen;
int nlen, rlen, err;
char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+ top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return -EEXIST;
}
nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
break;
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
/* By now the buffer is marked for journaling */
nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if (de->inode) {
struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
- de->rec_len = ext4_rec_len_to_disk(nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
de = de1;
}
de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len));
+ ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
ext4_error(dir->i_sb, __func__,
"invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
memcpy (data1, de, len);
de = (struct ext4_dir_entry_2 *) data1;
top = data1 + len;
- while ((char *)(de2 = ext4_next_entry(de)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize);
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+ blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = ext4_rec_len_to_disk(blocksize);
+ de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
- node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+ node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+ sb->s_blocksize);
node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
struct buffer_head *bh)
{
struct ext4_dir_entry_2 *de, *pde;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
int i;
i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
ext4_journal_get_write_access(handle, bh);
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
- ext4_rec_len_from_disk(pde->rec_len) +
- ext4_rec_len_from_disk(de->rec_len));
+ ext4_rec_len_from_disk(pde->rec_len,
+ blocksize) +
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize),
+ blocksize);
else
de->inode = 0;
dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
ext4_handle_dirty_metadata(handle, dir, bh);
return 0;
}
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len, blocksize);
pde = de;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return -ENOENT;
}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct inode *inode;
struct buffer_head *dir_block;
struct ext4_dir_entry_2 *de;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
int err, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
de = (struct ext4_dir_entry_2 *) dir_block->b_data;
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+ de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ blocksize);
strcpy(de->name, ".");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(1));
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+ blocksize);
de->name_len = 2;
strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
return 1;
}
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de);
+ de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
!le32_to_cpu(de1->inode) ||
strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
brelse(bh);
return 1;
}
- offset = ext4_rec_len_from_disk(de->rec_len) +
- ext4_rec_len_from_disk(de1->rec_len);
- de = ext4_next_entry(de1);
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+ ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
if (!bh ||
(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
brelse(bh);
return 0;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
- de = ext4_next_entry(de);
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
return 1;
@@ -2297,8 +2343,8 @@ retry:
return err;
}
-#define PARENT_INO(buffer) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+ (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
/*
* Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval;
+ int retval, force_da_alloc = 0;
old_bh = new_bh = dir_bh = NULL;
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+ old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (dir_bh) {
BUFFER_TRACE(dir_bh, "get_write_access");
ext4_journal_get_write_access(handle, dir_bh);
- PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+ cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext4_orphan_add(handle, new_inode);
+ if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+ force_da_alloc = 1;
}
retval = 0;
@@ -2457,6 +2507,8 @@ end_rename:
brelse(old_bh);
brelse(new_bh);
ext4_journal_stop(handle);
+ if (retval == 0 && force_da_alloc)
+ ext4_alloc_da_blocks(old_inode);
return retval;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd65..546c7dd869e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
- sbi->s_flex_groups[flex_group].free_blocks +=
- input->free_blocks_count;
- sbi->s_flex_groups[flex_group].free_inodes +=
- EXT4_INODES_PER_GROUP(sb);
+ atomic_add(input->free_blocks_count,
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(EXT4_INODES_PER_GROUP(sb),
+ &sbi->s_flex_groups[flex_group].free_inodes);
}
ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923..9987bba99db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/ctype.h>
#include <linux/marker.h>
#include <linux/log2.h>
#include <linux/crc16.h>
@@ -48,6 +49,7 @@
#include "group.h"
struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_commit_super(sb, es, 1);
}
if (sbi->s_proc) {
- remove_proc_entry("inode_readahead_blks", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
+ kobject_del(&sbi->s_kobj);
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_blkdev_remove(sbi);
}
sb->s_fs_info = NULL;
+ /*
+ * Now that we are completely done shutting down the
+ * superblock, we need to actually destroy the kobject.
+ */
+ unlock_kernel();
+ unlock_super(sb);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ lock_super(sb);
+ lock_kernel();
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
return;
}
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
seq_puts(seq, ",noacl");
#endif
- if (!test_opt(sb, RESERVATION))
- seq_puts(seq, ",noreservation");
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
+ if (test_opt(sb, NO_AUTO_DA_ALLOC))
+ seq_puts(seq, ",noauto_da_alloc");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -1004,7 +1018,7 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_i_version,
+ Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks, Opt_journal_ioprio
};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
{Opt_nodelalloc, "nodelalloc"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
{Opt_journal_ioprio, "journal_ioprio=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_err, NULL},
};
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
"not supported\n");
break;
#endif
- case Opt_reservation:
- set_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_noreservation:
- clear_opt(sbi->s_mount_opt, RESERVATION);
- break;
case Opt_journal_update:
/* @@@ FIXME */
/* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
case Opt_abort:
set_opt(sbi->s_mount_opt, ABORT);
break;
+ case Opt_nobarrier:
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
case Opt_barrier:
- if (match_int(&args[0], &option))
- return 0;
+ if (match_int(&args[0], &option)) {
+ set_opt(sbi->s_mount_opt, BARRIER);
+ break;
+ }
if (option)
set_opt(sbi->s_mount_opt, BARRIER);
else
@@ -1463,6 +1479,11 @@ set_qf_format:
return 0;
if (option < 0 || option > (1 << 30))
return 0;
+ if (option & (option - 1)) {
+ printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+ " must be a power of 2\n");
+ return 0;
+ }
sbi->s_inode_readahead_blks = option;
break;
case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
option);
break;
+ case Opt_noauto_da_alloc:
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+ break;
+ case Opt_auto_da_alloc:
+ if (match_int(&args[0], &option)) {
+ clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+ break;
+ }
+ if (option)
+ clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+ else
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+ break;
default:
printk(KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, &bh);
flex_group = ext4_flex_group(sbi, i);
- sbi->s_flex_groups[flex_group].free_inodes +=
- ext4_free_inodes_count(sb, gdp);
- sbi->s_flex_groups[flex_group].free_blocks +=
- ext4_free_blks_count(sb, gdp);
+ atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+ ext4_free_inodes_count(sb, gdp));
+ atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+ ext4_free_blks_count(sb, gdp));
+ atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+ ext4_used_dirs_count(sb, gdp));
}
return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return 0;
}
+/* sysfs supprt */
+
+struct ext4_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ const char *, size_t);
+ int offset;
+};
+
+static int parse_strtoul(const char *buf,
+ unsigned long max, unsigned long *value)
+{
+ char *endp;
+
+ while (*buf && isspace(*buf))
+ buf++;
+ *value = simple_strtoul(buf, &endp, 0);
+ while (*endp && isspace(*endp))
+ endp++;
+ if (*endp || *value > max)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+
+ if (parse_strtoul(buf, 0x40000000, &t))
+ return -EINVAL;
+
+ /* inode_readahead_blks must be a power of 2 */
+ if (t & (t-1))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned long t;
+
+ if (parse_strtoul(buf, 0xffffffff, &t))
+ return -EINVAL;
+ *ui = t;
+ return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .offset = offsetof(struct ext4_sb_info, _elname), \
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname) \
+ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+ inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT4_DEF_RESUID;
sbi->s_resgid = EXT4_DEF_RESGID;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
+ sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+ sectors[1]);
unlock_kernel();
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT4_SUPER_MAGIC)
goto cantfind_ext4;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- set_opt(sbi->s_mount_opt, RESERVATION);
set_opt(sbi->s_mount_opt, BARRIER);
/*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_PROC_FS
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
- if (sbi->s_proc)
- proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
- &ext4_ui_proc_fops,
- &sbi->s_inode_readahead_blks);
#endif
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
goto failed_mount4;
}
+ sbi->s_kobj.kset = ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+ "%s", sb->s_id);
+ if (err) {
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ goto failed_mount4;
+ };
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
kfree(sbi->s_group_desc);
failed_mount:
if (sbi->s_proc) {
- remove_proc_entry("inode_readahead_blks", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
#ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
set_buffer_uptodate(sbh);
}
es->s_wtime = cpu_to_le32(get_seconds());
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeblocks_counter));
es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
}
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
- unsigned int *p = m->private;
-
- seq_printf(m, "%u\n", *p);
- return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
- char str[32];
-
- if (cnt >= sizeof(str))
- return -EINVAL;
- if (copy_from_user(str, buf, cnt))
- return -EFAULT;
-
- *p = simple_strtoul(str, NULL, 0);
- return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
- .owner = THIS_MODULE,
- .open = ext4_ui_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = ext4_ui_proc_write,
-};
-#endif
-
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
{
int err;
+ ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+ if (!ext4_kset)
+ return -ENOMEM;
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = init_ext4_mballoc();
if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
exit_ext4_xattr();
exit_ext4_mballoc();
remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e0..296785a0dec 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -523,7 +523,9 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
+ struct super_block *sb = dentry->d_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
/* If the count of free cluster is still unknown, counts it here. */
if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
@@ -537,6 +539,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
buf->f_bfree = sbi->free_clusters;
buf->f_bavail = sbi->free_clusters;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = sbi->options.isvfat ? 260 : 12;
return 0;
@@ -930,7 +934,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
opts->fs_uid = current_uid();
opts->fs_gid = current_gid();
- opts->fs_fmask = opts->fs_dmask = current->fs->umask;
+ opts->fs_fmask = current_umask();
opts->allow_utime = -1;
opts->codepage = fat_default_codepage;
opts->iocharset = fat_default_iocharset;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e3fe9918faa..eed48063990 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -196,7 +196,7 @@ static void redirty_tail(struct inode *inode)
struct inode *tail_inode;
tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
- if (!time_after_eq(inode->dirtied_when,
+ if (time_before(inode->dirtied_when,
tail_inode->dirtied_when))
inode->dirtied_when = jiffies;
}
@@ -220,6 +220,21 @@ static void inode_sync_complete(struct inode *inode)
wake_up_bit(&inode->i_state, __I_SYNC);
}
+static bool inode_dirtied_after(struct inode *inode, unsigned long t)
+{
+ bool ret = time_after(inode->dirtied_when, t);
+#ifndef CONFIG_64BIT
+ /*
+ * For inodes being constantly redirtied, dirtied_when can get stuck.
+ * It _appears_ to be in the future, but is actually in distant past.
+ * This test is necessary to prevent such wrapped-around relative times
+ * from permanently stopping the whole pdflush writeback.
+ */
+ ret = ret && time_before_eq(inode->dirtied_when, jiffies);
+#endif
+ return ret;
+}
+
/*
* Move expired dirty inodes from @delaying_queue to @dispatch_queue.
*/
@@ -231,7 +246,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
struct inode *inode = list_entry(delaying_queue->prev,
struct inode, i_list);
if (older_than_this &&
- time_after(inode->dirtied_when, *older_than_this))
+ inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_list, dispatch_queue);
}
@@ -492,8 +507,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
continue; /* blockdev has wrong queue */
}
- /* Was this inode dirtied after sync_sb_inodes was called? */
- if (time_after(inode->dirtied_when, start))
+ /*
+ * Was this inode dirtied after sync_sb_inodes was called?
+ * This keeps sync from extra jobs and livelock.
+ */
+ if (inode_dirtied_after(inode, start))
break;
/* Is another pdflush already flushing this queue? */
@@ -538,7 +556,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
struct address_space *mapping;
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+ if (inode->i_state &
+ (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
mapping = inode->i_mapping;
if (mapping->nrpages == 0)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 00000000000..eee059052db
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,177 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_root(struct fs_struct *fs, struct path *path)
+{
+ struct path old_root;
+
+ write_lock(&fs->lock);
+ old_root = fs->root;
+ fs->root = *path;
+ path_get(path);
+ write_unlock(&fs->lock);
+ if (old_root.dentry)
+ path_put(&old_root);
+}
+
+/*
+ * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_pwd(struct fs_struct *fs, struct path *path)
+{
+ struct path old_pwd;
+
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ fs->pwd = *path;
+ path_get(path);
+ write_unlock(&fs->lock);
+
+ if (old_pwd.dentry)
+ path_put(&old_pwd);
+}
+
+void chroot_fs_refs(struct path *old_root, struct path *new_root)
+{
+ struct task_struct *g, *p;
+ struct fs_struct *fs;
+ int count = 0;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_lock(p);
+ fs = p->fs;
+ if (fs) {
+ write_lock(&fs->lock);
+ if (fs->root.dentry == old_root->dentry
+ && fs->root.mnt == old_root->mnt) {
+ path_get(new_root);
+ fs->root = *new_root;
+ count++;
+ }
+ if (fs->pwd.dentry == old_root->dentry
+ && fs->pwd.mnt == old_root->mnt) {
+ path_get(new_root);
+ fs->pwd = *new_root;
+ count++;
+ }
+ write_unlock(&fs->lock);
+ }
+ task_unlock(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ while (count--)
+ path_put(old_root);
+}
+
+void free_fs_struct(struct fs_struct *fs)
+{
+ path_put(&fs->root);
+ path_put(&fs->pwd);
+ kmem_cache_free(fs_cachep, fs);
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+ struct fs_struct *fs = tsk->fs;
+
+ if (fs) {
+ int kill;
+ task_lock(tsk);
+ write_lock(&fs->lock);
+ tsk->fs = NULL;
+ kill = !--fs->users;
+ write_unlock(&fs->lock);
+ task_unlock(tsk);
+ if (kill)
+ free_fs_struct(fs);
+ }
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+ struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+ /* We don't need to lock fs - think why ;-) */
+ if (fs) {
+ fs->users = 1;
+ fs->in_exec = 0;
+ rwlock_init(&fs->lock);
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->root = old->root;
+ path_get(&old->root);
+ fs->pwd = old->pwd;
+ path_get(&old->pwd);
+ read_unlock(&old->lock);
+ }
+ return fs;
+}
+
+int unshare_fs_struct(void)
+{
+ struct fs_struct *fs = current->fs;
+ struct fs_struct *new_fs = copy_fs_struct(fs);
+ int kill;
+
+ if (!new_fs)
+ return -ENOMEM;
+
+ task_lock(current);
+ write_lock(&fs->lock);
+ kill = !--fs->users;
+ current->fs = new_fs;
+ write_unlock(&fs->lock);
+ task_unlock(current);
+
+ if (kill)
+ free_fs_struct(fs);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unshare_fs_struct);
+
+int current_umask(void)
+{
+ return current->fs->umask;
+}
+EXPORT_SYMBOL(current_umask);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+ .users = 1,
+ .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
+ .umask = 0022,
+};
+
+void daemonize_fs_struct(void)
+{
+ struct fs_struct *fs = current->fs;
+
+ if (fs) {
+ int kill;
+
+ task_lock(current);
+
+ write_lock(&init_fs.lock);
+ init_fs.users++;
+ write_unlock(&init_fs.lock);
+
+ write_lock(&fs->lock);
+ current->fs = &init_fs;
+ kill = !--fs->users;
+ write_unlock(&fs->lock);
+
+ task_unlock(current);
+ if (kill)
+ free_fs_struct(fs);
+ }
+}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644
index 00000000000..9bbb8ce7bea
--- /dev/null
+++ b/fs/fscache/Kconfig
@@ -0,0 +1,56 @@
+
+config FSCACHE
+ tristate "General filesystem local caching manager"
+ depends on EXPERIMENTAL
+ select SLOW_WORK
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+ bool "Gather latency information on local caching"
+ depends on FSCACHE && PROC_FS
+ help
+ This option causes latency information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/histogram
+
+ The generation of this histogram adds a certain amount of overhead to
+ execution as there are a number of points at which data is gathered,
+ and on a multi-CPU system these may be on cachelines that keep
+ bouncing between CPUs. On the other hand, the histogram may be
+ useful for debugging purposes. Saying 'N' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644
index 00000000000..91571b95aac
--- /dev/null
+++ b/fs/fscache/Makefile
@@ -0,0 +1,19 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+ cache.o \
+ cookie.o \
+ fsdef.o \
+ main.o \
+ netfs.o \
+ object.o \
+ operation.o \
+ page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644
index 00000000000..e21985bbb1f
--- /dev/null
+++ b/fs/fscache/cache.c
@@ -0,0 +1,415 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+LIST_HEAD(fscache_cache_list);
+DECLARE_RWSEM(fscache_addremove_sem);
+DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
+EXPORT_SYMBOL(fscache_cache_cleared_wq);
+
+static LIST_HEAD(fscache_cache_tag_list);
+
+/*
+ * look up a cache tag
+ */
+struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
+{
+ struct fscache_cache_tag *tag, *xtag;
+
+ /* firstly check for the existence of the tag under read lock */
+ down_read(&fscache_addremove_sem);
+
+ list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+ if (strcmp(tag->name, name) == 0) {
+ atomic_inc(&tag->usage);
+ up_read(&fscache_addremove_sem);
+ return tag;
+ }
+ }
+
+ up_read(&fscache_addremove_sem);
+
+ /* the tag does not exist - create a candidate */
+ xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
+ if (!xtag)
+ /* return a dummy tag if out of memory */
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&xtag->usage, 1);
+ strcpy(xtag->name, name);
+
+ /* write lock, search again and add if still not present */
+ down_write(&fscache_addremove_sem);
+
+ list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+ if (strcmp(tag->name, name) == 0) {
+ atomic_inc(&tag->usage);
+ up_write(&fscache_addremove_sem);
+ kfree(xtag);
+ return tag;
+ }
+ }
+
+ list_add_tail(&xtag->link, &fscache_cache_tag_list);
+ up_write(&fscache_addremove_sem);
+ return xtag;
+}
+
+/*
+ * release a reference to a cache tag
+ */
+void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
+{
+ if (tag != ERR_PTR(-ENOMEM)) {
+ down_write(&fscache_addremove_sem);
+
+ if (atomic_dec_and_test(&tag->usage))
+ list_del_init(&tag->link);
+ else
+ tag = NULL;
+
+ up_write(&fscache_addremove_sem);
+
+ kfree(tag);
+ }
+}
+
+/*
+ * select a cache in which to store an object
+ * - the cache addremove semaphore must be at least read-locked by the caller
+ * - the object will never be an index
+ */
+struct fscache_cache *fscache_select_cache_for_object(
+ struct fscache_cookie *cookie)
+{
+ struct fscache_cache_tag *tag;
+ struct fscache_object *object;
+ struct fscache_cache *cache;
+
+ _enter("");
+
+ if (list_empty(&fscache_cache_list)) {
+ _leave(" = NULL [no cache]");
+ return NULL;
+ }
+
+ /* we check the parent to determine the cache to use */
+ spin_lock(&cookie->lock);
+
+ /* the first in the parent's backing list should be the preferred
+ * cache */
+ if (!hlist_empty(&cookie->backing_objects)) {
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ cache = object->cache;
+ if (object->state >= FSCACHE_OBJECT_DYING ||
+ test_bit(FSCACHE_IOERROR, &cache->flags))
+ cache = NULL;
+
+ spin_unlock(&cookie->lock);
+ _leave(" = %p [parent]", cache);
+ return cache;
+ }
+
+ /* the parent is unbacked */
+ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ /* cookie not an index and is unbacked */
+ spin_unlock(&cookie->lock);
+ _leave(" = NULL [cookie ub,ni]");
+ return NULL;
+ }
+
+ spin_unlock(&cookie->lock);
+
+ if (!cookie->def->select_cache)
+ goto no_preference;
+
+ /* ask the netfs for its preference */
+ tag = cookie->def->select_cache(cookie->parent->netfs_data,
+ cookie->netfs_data);
+ if (!tag)
+ goto no_preference;
+
+ if (tag == ERR_PTR(-ENOMEM)) {
+ _leave(" = NULL [nomem tag]");
+ return NULL;
+ }
+
+ if (!tag->cache) {
+ _leave(" = NULL [unbacked tag]");
+ return NULL;
+ }
+
+ if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
+ return NULL;
+
+ _leave(" = %p [specific]", tag->cache);
+ return tag->cache;
+
+no_preference:
+ /* netfs has no preference - just select first cache */
+ cache = list_entry(fscache_cache_list.next,
+ struct fscache_cache, link);
+ _leave(" = %p [first]", cache);
+ return cache;
+}
+
+/**
+ * fscache_init_cache - Initialise a cache record
+ * @cache: The cache record to be initialised
+ * @ops: The cache operations to be installed in that record
+ * @idfmt: Format string to define identifier
+ * @...: sprintf-style arguments
+ *
+ * Initialise a record of a cache and fill in the name.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_init_cache(struct fscache_cache *cache,
+ const struct fscache_cache_ops *ops,
+ const char *idfmt,
+ ...)
+{
+ va_list va;
+
+ memset(cache, 0, sizeof(*cache));
+
+ cache->ops = ops;
+
+ va_start(va, idfmt);
+ vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
+ va_end(va);
+
+ INIT_WORK(&cache->op_gc, fscache_operation_gc);
+ INIT_LIST_HEAD(&cache->link);
+ INIT_LIST_HEAD(&cache->object_list);
+ INIT_LIST_HEAD(&cache->op_gc_list);
+ spin_lock_init(&cache->object_list_lock);
+ spin_lock_init(&cache->op_gc_list_lock);
+}
+EXPORT_SYMBOL(fscache_init_cache);
+
+/**
+ * fscache_add_cache - Declare a cache as being open for business
+ * @cache: The record describing the cache
+ * @ifsdef: The record of the cache object describing the top-level index
+ * @tagname: The tag describing this cache
+ *
+ * Add a cache to the system, making it available for netfs's to use.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+int fscache_add_cache(struct fscache_cache *cache,
+ struct fscache_object *ifsdef,
+ const char *tagname)
+{
+ struct fscache_cache_tag *tag;
+
+ BUG_ON(!cache->ops);
+ BUG_ON(!ifsdef);
+
+ cache->flags = 0;
+ ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+
+ if (!tagname)
+ tagname = cache->identifier;
+
+ BUG_ON(!tagname[0]);
+
+ _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
+
+ /* we use the cache tag to uniquely identify caches */
+ tag = __fscache_lookup_cache_tag(tagname);
+ if (IS_ERR(tag))
+ goto nomem;
+
+ if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
+ goto tag_in_use;
+
+ cache->kobj = kobject_create_and_add(tagname, fscache_root);
+ if (!cache->kobj)
+ goto error;
+
+ ifsdef->cookie = &fscache_fsdef_index;
+ ifsdef->cache = cache;
+ cache->fsdef = ifsdef;
+
+ down_write(&fscache_addremove_sem);
+
+ tag->cache = cache;
+ cache->tag = tag;
+
+ /* add the cache to the list */
+ list_add(&cache->link, &fscache_cache_list);
+
+ /* add the cache's netfs definition index object to the cache's
+ * list */
+ spin_lock(&cache->object_list_lock);
+ list_add_tail(&ifsdef->cache_link, &cache->object_list);
+ spin_unlock(&cache->object_list_lock);
+
+ /* add the cache's netfs definition index object to the top level index
+ * cookie as a known backing object */
+ spin_lock(&fscache_fsdef_index.lock);
+
+ hlist_add_head(&ifsdef->cookie_link,
+ &fscache_fsdef_index.backing_objects);
+
+ atomic_inc(&fscache_fsdef_index.usage);
+
+ /* done */
+ spin_unlock(&fscache_fsdef_index.lock);
+ up_write(&fscache_addremove_sem);
+
+ printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
+ cache->tag->name, cache->ops->name);
+ kobject_uevent(cache->kobj, KOBJ_ADD);
+
+ _leave(" = 0 [%s]", cache->identifier);
+ return 0;
+
+tag_in_use:
+ printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
+ __fscache_release_cache_tag(tag);
+ _leave(" = -EXIST");
+ return -EEXIST;
+
+error:
+ __fscache_release_cache_tag(tag);
+ _leave(" = -EINVAL");
+ return -EINVAL;
+
+nomem:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(fscache_add_cache);
+
+/**
+ * fscache_io_error - Note a cache I/O error
+ * @cache: The record describing the cache
+ *
+ * Note that an I/O error occurred in a cache and that it should no longer be
+ * used for anything. This also reports the error into the kernel log.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_io_error(struct fscache_cache *cache)
+{
+ set_bit(FSCACHE_IOERROR, &cache->flags);
+
+ printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
+ cache->ops->name);
+}
+EXPORT_SYMBOL(fscache_io_error);
+
+/*
+ * request withdrawal of all the objects in a cache
+ * - all the objects being withdrawn are moved onto the supplied list
+ */
+static void fscache_withdraw_all_objects(struct fscache_cache *cache,
+ struct list_head *dying_objects)
+{
+ struct fscache_object *object;
+
+ spin_lock(&cache->object_list_lock);
+
+ while (!list_empty(&cache->object_list)) {
+ object = list_entry(cache->object_list.next,
+ struct fscache_object, cache_link);
+ list_move_tail(&object->cache_link, dying_objects);
+
+ _debug("withdraw %p", object->cookie);
+
+ spin_lock(&object->lock);
+ spin_unlock(&cache->object_list_lock);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
+ spin_unlock(&object->lock);
+
+ cond_resched();
+ spin_lock(&cache->object_list_lock);
+ }
+
+ spin_unlock(&cache->object_list_lock);
+}
+
+/**
+ * fscache_withdraw_cache - Withdraw a cache from the active service
+ * @cache: The record describing the cache
+ *
+ * Withdraw a cache from service, unbinding all its cache objects from the
+ * netfs cookies they're currently representing.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_withdraw_cache(struct fscache_cache *cache)
+{
+ LIST_HEAD(dying_objects);
+
+ _enter("");
+
+ printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
+ cache->tag->name);
+
+ /* make the cache unavailable for cookie acquisition */
+ if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
+ BUG();
+
+ down_write(&fscache_addremove_sem);
+ list_del_init(&cache->link);
+ cache->tag->cache = NULL;
+ up_write(&fscache_addremove_sem);
+
+ /* make sure all pages pinned by operations on behalf of the netfs are
+ * written to disk */
+ cache->ops->sync_cache(cache);
+
+ /* dissociate all the netfs pages backed by this cache from the block
+ * mappings in the cache */
+ cache->ops->dissociate_pages(cache);
+
+ /* we now have to destroy all the active objects pertaining to this
+ * cache - which we do by passing them off to thread pool to be
+ * disposed of */
+ _debug("destroy");
+
+ fscache_withdraw_all_objects(cache, &dying_objects);
+
+ /* wait for all extant objects to finish their outstanding operations
+ * and go away */
+ _debug("wait for finish");
+ wait_event(fscache_cache_cleared_wq,
+ atomic_read(&cache->object_count) == 0);
+ _debug("wait for clearance");
+ wait_event(fscache_cache_cleared_wq,
+ list_empty(&cache->object_list));
+ _debug("cleared");
+ ASSERT(list_empty(&dying_objects));
+
+ kobject_put(cache->kobj);
+
+ clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
+ fscache_release_cache_tag(cache->tag);
+ cache->tag = NULL;
+
+ _leave("");
+}
+EXPORT_SYMBOL(fscache_withdraw_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644
index 00000000000..72fd18f6c71
--- /dev/null
+++ b/fs/fscache/cookie.c
@@ -0,0 +1,500 @@
+/* netfs cookie management
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+ struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+ struct fscache_object *object);
+
+/*
+ * initialise an cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+ struct fscache_cookie *cookie = _cookie;
+
+ memset(cookie, 0, sizeof(*cookie));
+ spin_lock_init(&cookie->lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc)
+ * - parent specifies the parent object
+ * - the top level index cookie for each netfs is stored in the fscache_netfs
+ * struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ * needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ * indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ * - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+ struct fscache_cookie *parent,
+ const struct fscache_cookie_def *def,
+ void *netfs_data)
+{
+ struct fscache_cookie *cookie;
+
+ BUG_ON(!def);
+
+ _enter("{%s},{%s},%p",
+ parent ? (char *) parent->def->name : "<no-parent>",
+ def->name, netfs_data);
+
+ fscache_stat(&fscache_n_acquires);
+
+ /* if there's no parent cookie, then we don't create one here either */
+ if (!parent) {
+ fscache_stat(&fscache_n_acquires_null);
+ _leave(" [no parent]");
+ return NULL;
+ }
+
+ /* validate the definition */
+ BUG_ON(!def->get_key);
+ BUG_ON(!def->name[0]);
+
+ BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
+ parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+
+ /* allocate and initialise a cookie */
+ cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+ if (!cookie) {
+ fscache_stat(&fscache_n_acquires_oom);
+ _leave(" [ENOMEM]");
+ return NULL;
+ }
+
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+
+ atomic_inc(&parent->usage);
+ atomic_inc(&parent->n_children);
+
+ cookie->def = def;
+ cookie->parent = parent;
+ cookie->netfs_data = netfs_data;
+ cookie->flags = 0;
+
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+
+ switch (cookie->def->type) {
+ case FSCACHE_COOKIE_TYPE_INDEX:
+ fscache_stat(&fscache_n_cookie_index);
+ break;
+ case FSCACHE_COOKIE_TYPE_DATAFILE:
+ fscache_stat(&fscache_n_cookie_data);
+ break;
+ default:
+ fscache_stat(&fscache_n_cookie_special);
+ break;
+ }
+
+ /* if the object is an index then we need do nothing more here - we
+ * create indices on disk when we need them as an index may exist in
+ * multiple caches */
+ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (fscache_acquire_non_index_cookie(cookie) < 0) {
+ atomic_dec(&parent->n_children);
+ __fscache_cookie_put(cookie);
+ fscache_stat(&fscache_n_acquires_nobufs);
+ _leave(" = NULL");
+ return NULL;
+ }
+ }
+
+ fscache_stat(&fscache_n_acquires_ok);
+ _leave(" = %p", cookie);
+ return cookie;
+}
+EXPORT_SYMBOL(__fscache_acquire_cookie);
+
+/*
+ * acquire a non-index cookie
+ * - this must make sure the index chain is instantiated and instantiate the
+ * object representation too
+ */
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+{
+ struct fscache_object *object;
+ struct fscache_cache *cache;
+ uint64_t i_size;
+ int ret;
+
+ _enter("");
+
+ cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+
+ /* now we need to see whether the backing objects for this cookie yet
+ * exist, if not there'll be nothing to search */
+ down_read(&fscache_addremove_sem);
+
+ if (list_empty(&fscache_cache_list)) {
+ up_read(&fscache_addremove_sem);
+ _leave(" = 0 [no caches]");
+ return 0;
+ }
+
+ /* select a cache in which to store the object */
+ cache = fscache_select_cache_for_object(cookie->parent);
+ if (!cache) {
+ up_read(&fscache_addremove_sem);
+ fscache_stat(&fscache_n_acquires_no_cache);
+ _leave(" = -ENOMEDIUM [no cache]");
+ return -ENOMEDIUM;
+ }
+
+ _debug("cache %s", cache->tag->name);
+
+ cookie->flags =
+ (1 << FSCACHE_COOKIE_LOOKING_UP) |
+ (1 << FSCACHE_COOKIE_CREATING) |
+ (1 << FSCACHE_COOKIE_NO_DATA_YET);
+
+ /* ask the cache to allocate objects for this cookie and its parent
+ * chain */
+ ret = fscache_alloc_object(cache, cookie);
+ if (ret < 0) {
+ up_read(&fscache_addremove_sem);
+ _leave(" = %d", ret);
+ return ret;
+ }
+
+ /* pass on how big the object we're caching is supposed to be */
+ cookie->def->get_attr(cookie->netfs_data, &i_size);
+
+ spin_lock(&cookie->lock);
+ if (hlist_empty(&cookie->backing_objects)) {
+ spin_unlock(&cookie->lock);
+ goto unavailable;
+ }
+
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ fscache_set_store_limit(object, i_size);
+
+ /* initiate the process of looking up all the objects in the chain
+ * (done by fscache_initialise_object()) */
+ fscache_enqueue_object(object);
+
+ spin_unlock(&cookie->lock);
+
+ /* we may be required to wait for lookup to complete at this point */
+ if (!fscache_defer_lookup) {
+ _debug("non-deferred lookup %p", &cookie->flags);
+ wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ _debug("complete");
+ if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
+ goto unavailable;
+ }
+
+ up_read(&fscache_addremove_sem);
+ _leave(" = 0 [deferred]");
+ return 0;
+
+unavailable:
+ up_read(&fscache_addremove_sem);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+}
+
+/*
+ * recursively allocate cache object records for a cookie/cache combination
+ * - caller must be holding the addremove sem
+ */
+static int fscache_alloc_object(struct fscache_cache *cache,
+ struct fscache_cookie *cookie)
+{
+ struct fscache_object *object;
+ struct hlist_node *_n;
+ int ret;
+
+ _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
+
+ spin_lock(&cookie->lock);
+ hlist_for_each_entry(object, _n, &cookie->backing_objects,
+ cookie_link) {
+ if (object->cache == cache)
+ goto object_already_extant;
+ }
+ spin_unlock(&cookie->lock);
+
+ /* ask the cache to allocate an object (we may end up with duplicate
+ * objects at this stage, but we sort that out later) */
+ object = cache->ops->alloc_object(cache, cookie);
+ if (IS_ERR(object)) {
+ fscache_stat(&fscache_n_object_no_alloc);
+ ret = PTR_ERR(object);
+ goto error;
+ }
+
+ fscache_stat(&fscache_n_object_alloc);
+
+ object->debug_id = atomic_inc_return(&fscache_object_debug_id);
+
+ _debug("ALLOC OBJ%x: %s {%lx}",
+ object->debug_id, cookie->def->name, object->events);
+
+ ret = fscache_alloc_object(cache, cookie->parent);
+ if (ret < 0)
+ goto error_put;
+
+ /* only attach if we managed to allocate all we needed, otherwise
+ * discard the object we just allocated and instead use the one
+ * attached to the cookie */
+ if (fscache_attach_object(cookie, object) < 0)
+ cache->ops->put_object(object);
+
+ _leave(" = 0");
+ return 0;
+
+object_already_extant:
+ ret = -ENOBUFS;
+ if (object->state >= FSCACHE_OBJECT_DYING) {
+ spin_unlock(&cookie->lock);
+ goto error;
+ }
+ spin_unlock(&cookie->lock);
+ _leave(" = 0 [found]");
+ return 0;
+
+error_put:
+ cache->ops->put_object(object);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * attach a cache object to a cookie
+ */
+static int fscache_attach_object(struct fscache_cookie *cookie,
+ struct fscache_object *object)
+{
+ struct fscache_object *p;
+ struct fscache_cache *cache = object->cache;
+ struct hlist_node *_n;
+ int ret;
+
+ _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
+
+ spin_lock(&cookie->lock);
+
+ /* there may be multiple initial creations of this object, but we only
+ * want one */
+ ret = -EEXIST;
+ hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
+ if (p->cache == object->cache) {
+ if (p->state >= FSCACHE_OBJECT_DYING)
+ ret = -ENOBUFS;
+ goto cant_attach_object;
+ }
+ }
+
+ /* pin the parent object */
+ spin_lock_nested(&cookie->parent->lock, 1);
+ hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
+ cookie_link) {
+ if (p->cache == object->cache) {
+ if (p->state >= FSCACHE_OBJECT_DYING) {
+ ret = -ENOBUFS;
+ spin_unlock(&cookie->parent->lock);
+ goto cant_attach_object;
+ }
+ object->parent = p;
+ spin_lock(&p->lock);
+ p->n_children++;
+ spin_unlock(&p->lock);
+ break;
+ }
+ }
+ spin_unlock(&cookie->parent->lock);
+
+ /* attach to the cache's object list */
+ if (list_empty(&object->cache_link)) {
+ spin_lock(&cache->object_list_lock);
+ list_add(&object->cache_link, &cache->object_list);
+ spin_unlock(&cache->object_list_lock);
+ }
+
+ /* attach to the cookie */
+ object->cookie = cookie;
+ atomic_inc(&cookie->usage);
+ hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+ ret = 0;
+
+cant_attach_object:
+ spin_unlock(&cookie->lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * update the index entries backing a cookie
+ */
+void __fscache_update_cookie(struct fscache_cookie *cookie)
+{
+ struct fscache_object *object;
+ struct hlist_node *_p;
+
+ fscache_stat(&fscache_n_updates);
+
+ if (!cookie) {
+ fscache_stat(&fscache_n_updates_null);
+ _leave(" [no cookie]");
+ return;
+ }
+
+ _enter("{%s}", cookie->def->name);
+
+ BUG_ON(!cookie->def->get_aux);
+
+ spin_lock(&cookie->lock);
+
+ /* update the index entry on disk in each cache backing this cookie */
+ hlist_for_each_entry(object, _p,
+ &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+ }
+
+ spin_unlock(&cookie->lock);
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_update_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ * (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+{
+ struct fscache_cache *cache;
+ struct fscache_object *object;
+ unsigned long event;
+
+ fscache_stat(&fscache_n_relinquishes);
+
+ if (!cookie) {
+ fscache_stat(&fscache_n_relinquishes_null);
+ _leave(" [no cookie]");
+ return;
+ }
+
+ _enter("%p{%s,%p},%d",
+ cookie, cookie->def->name, cookie->netfs_data, retire);
+
+ if (atomic_read(&cookie->n_children) != 0) {
+ printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
+ cookie->def->name);
+ BUG();
+ }
+
+ /* wait for the cookie to finish being instantiated (or to fail) */
+ if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
+ fscache_stat(&fscache_n_relinquishes_waitcrt);
+ wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ }
+
+ event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+
+ /* detach pointers back to the netfs */
+ spin_lock(&cookie->lock);
+
+ cookie->netfs_data = NULL;
+ cookie->def = NULL;
+
+ /* break links with all the active objects */
+ while (!hlist_empty(&cookie->backing_objects)) {
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object,
+ cookie_link);
+
+ _debug("RELEASE OBJ%x", object->debug_id);
+
+ /* detach each cache object from the object cookie */
+ spin_lock(&object->lock);
+ hlist_del_init(&object->cookie_link);
+
+ cache = object->cache;
+ object->cookie = NULL;
+ fscache_raise_event(object, event);
+ spin_unlock(&object->lock);
+
+ if (atomic_dec_and_test(&cookie->usage))
+ /* the cookie refcount shouldn't be reduced to 0 yet */
+ BUG();
+ }
+
+ spin_unlock(&cookie->lock);
+
+ if (cookie->parent) {
+ ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
+ ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
+ atomic_dec(&cookie->parent->n_children);
+ }
+
+ /* finally dispose of the cookie */
+ ASSERTCMP(atomic_read(&cookie->usage), >, 0);
+ fscache_cookie_put(cookie);
+
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_relinquish_cookie);
+
+/*
+ * destroy a cookie
+ */
+void __fscache_cookie_put(struct fscache_cookie *cookie)
+{
+ struct fscache_cookie *parent;
+
+ _enter("%p", cookie);
+
+ for (;;) {
+ _debug("FREE COOKIE %p", cookie);
+ parent = cookie->parent;
+ BUG_ON(!hlist_empty(&cookie->backing_objects));
+ kmem_cache_free(fscache_cookie_jar, cookie);
+
+ if (!parent)
+ break;
+
+ cookie = parent;
+ BUG_ON(atomic_read(&cookie->usage) <= 0);
+ if (!atomic_dec_and_test(&cookie->usage))
+ break;
+ }
+
+ _leave("");
+}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644
index 00000000000..f5b4baee735
--- /dev/null
+++ b/fs/fscache/fsdef.c
@@ -0,0 +1,144 @@
+/* Filesystem index definition
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include "internal.h"
+
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax);
+
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax);
+
+static
+enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen);
+
+/*
+ * The root index is owned by FS-Cache itself.
+ *
+ * When a netfs requests caching facilities, FS-Cache will, if one doesn't
+ * already exist, create an entry in the root index with the key being the name
+ * of the netfs ("AFS" for example), and the auxiliary data holding the index
+ * structure version supplied by the netfs:
+ *
+ * FSDEF
+ * |
+ * +-----------+
+ * | |
+ * NFS AFS
+ * [v=1] [v=1]
+ *
+ * If an entry with the appropriate name does already exist, the version is
+ * compared. If the version is different, the entire subtree from that entry
+ * will be discarded and a new entry created.
+ *
+ * The new entry will be an index, and a cookie referring to it will be passed
+ * to the netfs. This is then the root handle by which the netfs accesses the
+ * cache. It can create whatever objects it likes in that index, including
+ * further indices.
+ */
+static struct fscache_cookie_def fscache_fsdef_index_def = {
+ .name = ".FS-Cache",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+struct fscache_cookie fscache_fsdef_index = {
+ .usage = ATOMIC_INIT(1),
+ .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
+ .backing_objects = HLIST_HEAD_INIT,
+ .def = &fscache_fsdef_index_def,
+};
+EXPORT_SYMBOL(fscache_fsdef_index);
+
+/*
+ * Definition of an entry in the root index. Each entry is an index, keyed to
+ * a specific netfs and only applicable to a particular version of the index
+ * structure used by that netfs.
+ */
+struct fscache_cookie_def fscache_fsdef_netfs_def = {
+ .name = "FSDEF.netfs",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = fscache_fsdef_netfs_get_key,
+ .get_aux = fscache_fsdef_netfs_get_aux,
+ .check_aux = fscache_fsdef_netfs_check_aux,
+};
+
+/*
+ * get the key data for an FSDEF index record - this is the name of the netfs
+ * for which this entry is created
+ */
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct fscache_netfs *netfs = cookie_netfs_data;
+ unsigned klen;
+
+ _enter("{%s.%u},", netfs->name, netfs->version);
+
+ klen = strlen(netfs->name);
+ if (klen > bufmax)
+ return 0;
+
+ memcpy(buffer, netfs->name, klen);
+ return klen;
+}
+
+/*
+ * get the auxiliary data for an FSDEF index record - this is the index
+ * structure version number of the netfs for which this version is created
+ */
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct fscache_netfs *netfs = cookie_netfs_data;
+ unsigned dlen;
+
+ _enter("{%s.%u},", netfs->name, netfs->version);
+
+ dlen = sizeof(uint32_t);
+ if (dlen > bufmax)
+ return 0;
+
+ memcpy(buffer, &netfs->version, dlen);
+ return dlen;
+}
+
+/*
+ * check that the index structure version number stored in the auxiliary data
+ * matches the one the netfs gave us
+ */
+static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
+ void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen)
+{
+ struct fscache_netfs *netfs = cookie_netfs_data;
+ uint32_t version;
+
+ _enter("{%s},,%hu", netfs->name, datalen);
+
+ if (datalen != sizeof(version)) {
+ _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
+ return FSCACHE_CHECKAUX_OBSOLETE;
+ }
+
+ memcpy(&version, data, sizeof(version));
+ if (version != netfs->version) {
+ _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
+ return FSCACHE_CHECKAUX_OBSOLETE;
+ }
+
+ _leave(" = OKAY");
+ return FSCACHE_CHECKAUX_OKAY;
+}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644
index 00000000000..bad496748a5
--- /dev/null
+++ b/fs/fscache/histogram.c
@@ -0,0 +1,109 @@
+/* FS-Cache latency histogram
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t fscache_obj_instantiate_histogram[HZ];
+atomic_t fscache_objs_histogram[HZ];
+atomic_t fscache_ops_histogram[HZ];
+atomic_t fscache_retrieval_delay_histogram[HZ];
+atomic_t fscache_retrieval_histogram[HZ];
+
+/*
+ * display the time-taken histogram
+ */
+static int fscache_histogram_show(struct seq_file *m, void *v)
+{
+ unsigned long index;
+ unsigned n[5], t;
+
+ switch ((unsigned long) v) {
+ case 1:
+ seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
+ " RETRV DLY RETRIEVLS\n");
+ return 0;
+ case 2:
+ seq_puts(m, "===== ===== ========= ========= ========="
+ " ========= =========\n");
+ return 0;
+ default:
+ index = (unsigned long) v - 3;
+ n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
+ n[1] = atomic_read(&fscache_ops_histogram[index]);
+ n[2] = atomic_read(&fscache_objs_histogram[index]);
+ n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
+ n[4] = atomic_read(&fscache_retrieval_histogram[index]);
+ if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
+ return 0;
+
+ t = (index * 1000) / HZ;
+
+ seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n",
+ index, t, n[0], n[1], n[2], n[3], n[4]);
+ return 0;
+ }
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+ if ((unsigned long long)*_pos >= HZ + 2)
+ return NULL;
+ if (*_pos == 0)
+ *_pos = 1;
+ return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return (unsigned long long)*pos > HZ + 2 ?
+ NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations fscache_histogram_ops = {
+ .start = fscache_histogram_start,
+ .stop = fscache_histogram_stop,
+ .next = fscache_histogram_next,
+ .show = fscache_histogram_show,
+};
+
+/*
+ * open "/proc/fs/fscache/histogram" to provide latency data
+ */
+static int fscache_histogram_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fscache_histogram_ops);
+}
+
+const struct file_operations fscache_histogram_fops = {
+ .owner = THIS_MODULE,
+ .open = fscache_histogram_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644
index 00000000000..e0cbd16f6dc
--- /dev/null
+++ b/fs/fscache/internal.h
@@ -0,0 +1,380 @@
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Lock order, in the order in which multiple locks should be obtained:
+ * - fscache_addremove_sem
+ * - cookie->lock
+ * - cookie->parent->lock
+ * - cache->object_list_lock
+ * - object->lock
+ * - object->parent->lock
+ * - fscache_thread_lock
+ *
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/sched.h>
+
+#define FSCACHE_MIN_THREADS 4
+#define FSCACHE_MAX_THREADS 32
+
+/*
+ * fsc-cache.c
+ */
+extern struct list_head fscache_cache_list;
+extern struct rw_semaphore fscache_addremove_sem;
+
+extern struct fscache_cache *fscache_select_cache_for_object(
+ struct fscache_cookie *);
+
+/*
+ * fsc-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+
+extern void fscache_cookie_init_once(void *);
+extern void __fscache_cookie_put(struct fscache_cookie *);
+
+/*
+ * fsc-fsdef.c
+ */
+extern struct fscache_cookie fscache_fsdef_index;
+extern struct fscache_cookie_def fscache_fsdef_netfs_def;
+
+/*
+ * fsc-histogram.c
+ */
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+extern atomic_t fscache_obj_instantiate_histogram[HZ];
+extern atomic_t fscache_objs_histogram[HZ];
+extern atomic_t fscache_ops_histogram[HZ];
+extern atomic_t fscache_retrieval_delay_histogram[HZ];
+extern atomic_t fscache_retrieval_histogram[HZ];
+
+static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
+{
+ unsigned long jif = jiffies - start_jif;
+ if (jif >= HZ)
+ jif = HZ - 1;
+ atomic_inc(&histogram[jif]);
+}
+
+extern const struct file_operations fscache_histogram_fops;
+
+#else
+#define fscache_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * fsc-main.c
+ */
+extern unsigned fscache_defer_lookup;
+extern unsigned fscache_defer_create;
+extern unsigned fscache_debug;
+extern struct kobject *fscache_root;
+
+extern int fscache_wait_bit(void *);
+extern int fscache_wait_bit_interruptible(void *);
+
+/*
+ * fsc-object.c
+ */
+extern void fscache_withdrawing_object(struct fscache_cache *,
+ struct fscache_object *);
+extern void fscache_enqueue_object(struct fscache_object *);
+
+/*
+ * fsc-operation.c
+ */
+extern int fscache_submit_exclusive_op(struct fscache_object *,
+ struct fscache_operation *);
+extern int fscache_submit_op(struct fscache_object *,
+ struct fscache_operation *);
+extern void fscache_abort_object(struct fscache_object *);
+extern void fscache_start_operations(struct fscache_object *);
+extern void fscache_operation_gc(struct work_struct *);
+
+/*
+ * fsc-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fsc-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
+extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
+
+extern atomic_t fscache_n_op_pend;
+extern atomic_t fscache_n_op_run;
+extern atomic_t fscache_n_op_enqueue;
+extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_release;
+extern atomic_t fscache_n_op_gc;
+
+extern atomic_t fscache_n_attr_changed;
+extern atomic_t fscache_n_attr_changed_ok;
+extern atomic_t fscache_n_attr_changed_nobufs;
+extern atomic_t fscache_n_attr_changed_nomem;
+extern atomic_t fscache_n_attr_changed_calls;
+
+extern atomic_t fscache_n_allocs;
+extern atomic_t fscache_n_allocs_ok;
+extern atomic_t fscache_n_allocs_wait;
+extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_alloc_ops;
+extern atomic_t fscache_n_alloc_op_waits;
+
+extern atomic_t fscache_n_retrievals;
+extern atomic_t fscache_n_retrievals_ok;
+extern atomic_t fscache_n_retrievals_wait;
+extern atomic_t fscache_n_retrievals_nodata;
+extern atomic_t fscache_n_retrievals_nobufs;
+extern atomic_t fscache_n_retrievals_intr;
+extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrieval_ops;
+extern atomic_t fscache_n_retrieval_op_waits;
+
+extern atomic_t fscache_n_stores;
+extern atomic_t fscache_n_stores_ok;
+extern atomic_t fscache_n_stores_again;
+extern atomic_t fscache_n_stores_nobufs;
+extern atomic_t fscache_n_stores_oom;
+extern atomic_t fscache_n_store_ops;
+extern atomic_t fscache_n_store_calls;
+
+extern atomic_t fscache_n_marks;
+extern atomic_t fscache_n_uncaches;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_null;
+extern atomic_t fscache_n_acquires_no_cache;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_nobufs;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_updates;
+extern atomic_t fscache_n_updates_null;
+extern atomic_t fscache_n_updates_run;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_null;
+extern atomic_t fscache_n_relinquishes_waitcrt;
+
+extern atomic_t fscache_n_cookie_index;
+extern atomic_t fscache_n_cookie_data;
+extern atomic_t fscache_n_cookie_special;
+
+extern atomic_t fscache_n_object_alloc;
+extern atomic_t fscache_n_object_no_alloc;
+extern atomic_t fscache_n_object_lookups;
+extern atomic_t fscache_n_object_lookups_negative;
+extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_created;
+extern atomic_t fscache_n_object_avail;
+extern atomic_t fscache_n_object_dead;
+
+extern atomic_t fscache_n_checkaux_none;
+extern atomic_t fscache_n_checkaux_okay;
+extern atomic_t fscache_n_checkaux_update;
+extern atomic_t fscache_n_checkaux_obsolete;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+extern const struct file_operations fscache_stats_fops;
+#else
+
+#define fscache_stat(stat) do {} while (0)
+#endif
+
+/*
+ * raise an event on an object
+ * - if the event is not masked for that object, then the object is
+ * queued for attention by the thread pool.
+ */
+static inline void fscache_raise_event(struct fscache_object *object,
+ unsigned event)
+{
+ if (!test_and_set_bit(event, &object->events) &&
+ test_bit(event, &object->event_mask))
+ fscache_enqueue_object(object);
+}
+
+/*
+ * drop a reference to a cookie
+ */
+static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+{
+ BUG_ON(atomic_read(&cookie->usage) <= 0);
+ if (atomic_dec_and_test(&cookie->usage))
+ __fscache_cookie_put(cookie);
+}
+
+/*
+ * get an extra reference to a netfs retrieval context
+ */
+static inline
+void *fscache_get_context(struct fscache_cookie *cookie, void *context)
+{
+ if (cookie->def->get_context)
+ cookie->def->get_context(cookie->netfs_data, context);
+ return context;
+}
+
+/*
+ * release a reference to a netfs retrieval context
+ */
+static inline
+void fscache_put_context(struct fscache_cookie *cookie, void *context)
+{
+ if (cookie->def->put_context)
+ cookie->def->put_context(cookie->netfs_data, context);
+}
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf, 1, 2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_FSCACHE_DEBUG)
+#define _enter(FMT, ...) \
+do { \
+ if (__do_kdebug(ENTER)) \
+ kenter(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _leave(FMT, ...) \
+do { \
+ if (__do_kdebug(LEAVE)) \
+ kleave(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _debug(FMT, ...) \
+do { \
+ if (__do_kdebug(DEBUG)) \
+ kdebug(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+/*
+ * determine whether a particular optional debugging point should be logged
+ * - we need to go through three steps to persuade cpp to correctly join the
+ * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
+ */
+#define ____do_kdebug(LEVEL, POINT) \
+ unlikely((fscache_debug & \
+ (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
+#define ___do_kdebug(LEVEL, POINT) \
+ ____do_kdebug(LEVEL, POINT)
+#define __do_kdebug(POINT) \
+ ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
+
+#define FSCACHE_DEBUG_CACHE 0
+#define FSCACHE_DEBUG_COOKIE 1
+#define FSCACHE_DEBUG_PAGE 2
+#define FSCACHE_DEBUG_OPERATION 3
+
+#define FSCACHE_POINT_ENTER 1
+#define FSCACHE_POINT_LEAVE 2
+#define FSCACHE_POINT_DEBUG 4
+
+#ifndef FSCACHE_DEBUG_LEVEL
+#define FSCACHE_DEBUG_LEVEL CACHE
+#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644
index 00000000000..4de41b59749
--- /dev/null
+++ b/fs/fscache/main.c
@@ -0,0 +1,124 @@
+/* General filesystem local caching manager
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("FS Cache Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned fscache_defer_lookup = 1;
+module_param_named(defer_lookup, fscache_defer_lookup, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_lookup,
+ "Defer cookie lookup to background thread");
+
+unsigned fscache_defer_create = 1;
+module_param_named(defer_create, fscache_defer_create, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_create,
+ "Defer cookie creation to background thread");
+
+unsigned fscache_debug;
+module_param_named(debug, fscache_debug, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_debug,
+ "FS-Cache debugging mask");
+
+struct kobject *fscache_root;
+
+/*
+ * initialise the fs caching module
+ */
+static int __init fscache_init(void)
+{
+ int ret;
+
+ ret = slow_work_register_user();
+ if (ret < 0)
+ goto error_slow_work;
+
+ ret = fscache_proc_init();
+ if (ret < 0)
+ goto error_proc;
+
+ fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
+ sizeof(struct fscache_cookie),
+ 0,
+ 0,
+ fscache_cookie_init_once);
+ if (!fscache_cookie_jar) {
+ printk(KERN_NOTICE
+ "FS-Cache: Failed to allocate a cookie jar\n");
+ ret = -ENOMEM;
+ goto error_cookie_jar;
+ }
+
+ fscache_root = kobject_create_and_add("fscache", kernel_kobj);
+ if (!fscache_root)
+ goto error_kobj;
+
+ printk(KERN_NOTICE "FS-Cache: Loaded\n");
+ return 0;
+
+error_kobj:
+ kmem_cache_destroy(fscache_cookie_jar);
+error_cookie_jar:
+ fscache_proc_cleanup();
+error_proc:
+ slow_work_unregister_user();
+error_slow_work:
+ return ret;
+}
+
+fs_initcall(fscache_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit fscache_exit(void)
+{
+ _enter("");
+
+ kobject_put(fscache_root);
+ kmem_cache_destroy(fscache_cookie_jar);
+ fscache_proc_cleanup();
+ slow_work_unregister_user();
+ printk(KERN_NOTICE "FS-Cache: Unloaded\n");
+}
+
+module_exit(fscache_exit);
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+int fscache_wait_bit(void *flags)
+{
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(fscache_wait_bit);
+
+/*
+ * wait_on_bit() sleep function for interruptible waiting
+ */
+int fscache_wait_bit_interruptible(void *flags)
+{
+ schedule();
+ return signal_pending(current);
+}
+EXPORT_SYMBOL(fscache_wait_bit_interruptible);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644
index 00000000000..e028b8eb1c4
--- /dev/null
+++ b/fs/fscache/netfs.c
@@ -0,0 +1,103 @@
+/* FS-Cache netfs (client) registration
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static LIST_HEAD(fscache_netfs_list);
+
+/*
+ * register a network filesystem for caching
+ */
+int __fscache_register_netfs(struct fscache_netfs *netfs)
+{
+ struct fscache_netfs *ptr;
+ int ret;
+
+ _enter("{%s}", netfs->name);
+
+ INIT_LIST_HEAD(&netfs->link);
+
+ /* allocate a cookie for the primary index */
+ netfs->primary_index =
+ kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+
+ if (!netfs->primary_index) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ /* initialise the primary index cookie */
+ atomic_set(&netfs->primary_index->usage, 1);
+ atomic_set(&netfs->primary_index->n_children, 0);
+
+ netfs->primary_index->def = &fscache_fsdef_netfs_def;
+ netfs->primary_index->parent = &fscache_fsdef_index;
+ netfs->primary_index->netfs_data = netfs;
+
+ atomic_inc(&netfs->primary_index->parent->usage);
+ atomic_inc(&netfs->primary_index->parent->n_children);
+
+ spin_lock_init(&netfs->primary_index->lock);
+ INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+
+ /* check the netfs type is not already present */
+ down_write(&fscache_addremove_sem);
+
+ ret = -EEXIST;
+ list_for_each_entry(ptr, &fscache_netfs_list, link) {
+ if (strcmp(ptr->name, netfs->name) == 0)
+ goto already_registered;
+ }
+
+ list_add(&netfs->link, &fscache_netfs_list);
+ ret = 0;
+
+ printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
+ netfs->name);
+
+already_registered:
+ up_write(&fscache_addremove_sem);
+
+ if (ret < 0) {
+ netfs->primary_index->parent = NULL;
+ __fscache_cookie_put(netfs->primary_index);
+ netfs->primary_index = NULL;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(__fscache_register_netfs);
+
+/*
+ * unregister a network filesystem from the cache
+ * - all cookies must have been released first
+ */
+void __fscache_unregister_netfs(struct fscache_netfs *netfs)
+{
+ _enter("{%s.%u}", netfs->name, netfs->version);
+
+ down_write(&fscache_addremove_sem);
+
+ list_del(&netfs->link);
+ fscache_relinquish_cookie(netfs->primary_index, 0);
+
+ up_write(&fscache_addremove_sem);
+
+ printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
+ netfs->name);
+
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_unregister_netfs);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644
index 00000000000..392a41b1b79
--- /dev/null
+++ b/fs/fscache/object.c
@@ -0,0 +1,810 @@
+/* FS-Cache object state machine handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/object.txt for a description of the
+ * object state machine and the in-kernel representations.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include "internal.h"
+
+const char *fscache_object_states[] = {
+ [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
+ [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
+ [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
+ [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
+ [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
+ [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
+ [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
+ [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
+ [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT",
+ [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING",
+ [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING",
+ [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING",
+ [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD",
+};
+EXPORT_SYMBOL(fscache_object_states);
+
+static void fscache_object_slow_work_put_ref(struct slow_work *);
+static int fscache_object_slow_work_get_ref(struct slow_work *);
+static void fscache_object_slow_work_execute(struct slow_work *);
+static void fscache_initialise_object(struct fscache_object *);
+static void fscache_lookup_object(struct fscache_object *);
+static void fscache_object_available(struct fscache_object *);
+static void fscache_release_object(struct fscache_object *);
+static void fscache_withdraw_object(struct fscache_object *);
+static void fscache_enqueue_dependents(struct fscache_object *);
+static void fscache_dequeue_object(struct fscache_object *);
+
+const struct slow_work_ops fscache_object_slow_work_ops = {
+ .get_ref = fscache_object_slow_work_get_ref,
+ .put_ref = fscache_object_slow_work_put_ref,
+ .execute = fscache_object_slow_work_execute,
+};
+EXPORT_SYMBOL(fscache_object_slow_work_ops);
+
+/*
+ * we need to notify the parent when an op completes that we had outstanding
+ * upon it
+ */
+static inline void fscache_done_parent_op(struct fscache_object *object)
+{
+ struct fscache_object *parent = object->parent;
+
+ _enter("OBJ%x {OBJ%x,%x}",
+ object->debug_id, parent->debug_id, parent->n_ops);
+
+ spin_lock_nested(&parent->lock, 1);
+ parent->n_ops--;
+ parent->n_obj_ops--;
+ if (parent->n_ops == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+}
+
+/*
+ * process events that have been sent to an object's state machine
+ * - initiates parent lookup
+ * - does object lookup
+ * - does object creation
+ * - does object recycling and retirement
+ * - does object withdrawal
+ */
+static void fscache_object_state_machine(struct fscache_object *object)
+{
+ enum fscache_object_state new_state;
+
+ ASSERT(object != NULL);
+
+ _enter("{OBJ%x,%s,%lx}",
+ object->debug_id, fscache_object_states[object->state],
+ object->events);
+
+ switch (object->state) {
+ /* wait for the parent object to become ready */
+ case FSCACHE_OBJECT_INIT:
+ object->event_mask =
+ ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ fscache_initialise_object(object);
+ goto done;
+
+ /* look up the object metadata on disk */
+ case FSCACHE_OBJECT_LOOKING_UP:
+ fscache_lookup_object(object);
+ goto lookup_transit;
+
+ /* create the object metadata on disk */
+ case FSCACHE_OBJECT_CREATING:
+ fscache_lookup_object(object);
+ goto lookup_transit;
+
+ /* handle an object becoming available; start pending
+ * operations and queue dependent operations for processing */
+ case FSCACHE_OBJECT_AVAILABLE:
+ fscache_object_available(object);
+ goto active_transit;
+
+ /* normal running state */
+ case FSCACHE_OBJECT_ACTIVE:
+ goto active_transit;
+
+ /* update the object metadata on disk */
+ case FSCACHE_OBJECT_UPDATING:
+ clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
+ fscache_stat(&fscache_n_updates_run);
+ object->cache->ops->update_object(object);
+ goto active_transit;
+
+ /* handle an object dying during lookup or creation */
+ case FSCACHE_OBJECT_LC_DYING:
+ object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+ object->cache->ops->lookup_complete(object);
+
+ spin_lock(&object->lock);
+ object->state = FSCACHE_OBJECT_DYING;
+ if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+ &object->cookie->flags))
+ wake_up_bit(&object->cookie->flags,
+ FSCACHE_COOKIE_CREATING);
+ spin_unlock(&object->lock);
+
+ fscache_done_parent_op(object);
+
+ /* wait for completion of all active operations on this object
+ * and the death of all child objects of this object */
+ case FSCACHE_OBJECT_DYING:
+ dying:
+ clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
+ spin_lock(&object->lock);
+ _debug("dying OBJ%x {%d,%d}",
+ object->debug_id, object->n_ops, object->n_children);
+ if (object->n_ops == 0 && object->n_children == 0) {
+ object->event_mask &=
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ object->event_mask |=
+ (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+ (1 << FSCACHE_OBJECT_EV_RETIRE) |
+ (1 << FSCACHE_OBJECT_EV_RELEASE) |
+ (1 << FSCACHE_OBJECT_EV_ERROR);
+ } else {
+ object->event_mask &=
+ ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+ (1 << FSCACHE_OBJECT_EV_RETIRE) |
+ (1 << FSCACHE_OBJECT_EV_RELEASE) |
+ (1 << FSCACHE_OBJECT_EV_ERROR));
+ object->event_mask |=
+ 1 << FSCACHE_OBJECT_EV_CLEARED;
+ }
+ spin_unlock(&object->lock);
+ fscache_enqueue_dependents(object);
+ goto terminal_transit;
+
+ /* handle an abort during initialisation */
+ case FSCACHE_OBJECT_ABORT_INIT:
+ _debug("handle abort init %lx", object->events);
+ object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+
+ spin_lock(&object->lock);
+ fscache_dequeue_object(object);
+
+ object->state = FSCACHE_OBJECT_DYING;
+ if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+ &object->cookie->flags))
+ wake_up_bit(&object->cookie->flags,
+ FSCACHE_COOKIE_CREATING);
+ spin_unlock(&object->lock);
+ goto dying;
+
+ /* handle the netfs releasing an object and possibly marking it
+ * obsolete too */
+ case FSCACHE_OBJECT_RELEASING:
+ case FSCACHE_OBJECT_RECYCLING:
+ object->event_mask &=
+ ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+ (1 << FSCACHE_OBJECT_EV_RETIRE) |
+ (1 << FSCACHE_OBJECT_EV_RELEASE) |
+ (1 << FSCACHE_OBJECT_EV_ERROR));
+ fscache_release_object(object);
+ spin_lock(&object->lock);
+ object->state = FSCACHE_OBJECT_DEAD;
+ spin_unlock(&object->lock);
+ fscache_stat(&fscache_n_object_dead);
+ goto terminal_transit;
+
+ /* handle the parent cache of this object being withdrawn from
+ * active service */
+ case FSCACHE_OBJECT_WITHDRAWING:
+ object->event_mask &=
+ ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+ (1 << FSCACHE_OBJECT_EV_RETIRE) |
+ (1 << FSCACHE_OBJECT_EV_RELEASE) |
+ (1 << FSCACHE_OBJECT_EV_ERROR));
+ fscache_withdraw_object(object);
+ spin_lock(&object->lock);
+ object->state = FSCACHE_OBJECT_DEAD;
+ spin_unlock(&object->lock);
+ fscache_stat(&fscache_n_object_dead);
+ goto terminal_transit;
+
+ /* complain about the object being woken up once it is
+ * deceased */
+ case FSCACHE_OBJECT_DEAD:
+ printk(KERN_ERR "FS-Cache:"
+ " Unexpected event in dead state %lx\n",
+ object->events & object->event_mask);
+ BUG();
+
+ default:
+ printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
+ object->state);
+ BUG();
+ }
+
+ /* determine the transition from a lookup state */
+lookup_transit:
+ switch (fls(object->events & object->event_mask) - 1) {
+ case FSCACHE_OBJECT_EV_WITHDRAW:
+ case FSCACHE_OBJECT_EV_RETIRE:
+ case FSCACHE_OBJECT_EV_RELEASE:
+ case FSCACHE_OBJECT_EV_ERROR:
+ new_state = FSCACHE_OBJECT_LC_DYING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_REQUEUE:
+ goto done;
+ case -1:
+ goto done; /* sleep until event */
+ default:
+ goto unsupported_event;
+ }
+
+ /* determine the transition from an active state */
+active_transit:
+ switch (fls(object->events & object->event_mask) - 1) {
+ case FSCACHE_OBJECT_EV_WITHDRAW:
+ case FSCACHE_OBJECT_EV_RETIRE:
+ case FSCACHE_OBJECT_EV_RELEASE:
+ case FSCACHE_OBJECT_EV_ERROR:
+ new_state = FSCACHE_OBJECT_DYING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_UPDATE:
+ new_state = FSCACHE_OBJECT_UPDATING;
+ goto change_state;
+ case -1:
+ new_state = FSCACHE_OBJECT_ACTIVE;
+ goto change_state; /* sleep until event */
+ default:
+ goto unsupported_event;
+ }
+
+ /* determine the transition from a terminal state */
+terminal_transit:
+ switch (fls(object->events & object->event_mask) - 1) {
+ case FSCACHE_OBJECT_EV_WITHDRAW:
+ new_state = FSCACHE_OBJECT_WITHDRAWING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_RETIRE:
+ new_state = FSCACHE_OBJECT_RECYCLING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_RELEASE:
+ new_state = FSCACHE_OBJECT_RELEASING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_ERROR:
+ new_state = FSCACHE_OBJECT_WITHDRAWING;
+ goto change_state;
+ case FSCACHE_OBJECT_EV_CLEARED:
+ new_state = FSCACHE_OBJECT_DYING;
+ goto change_state;
+ case -1:
+ goto done; /* sleep until event */
+ default:
+ goto unsupported_event;
+ }
+
+change_state:
+ spin_lock(&object->lock);
+ object->state = new_state;
+ spin_unlock(&object->lock);
+
+done:
+ _leave(" [->%s]", fscache_object_states[object->state]);
+ return;
+
+unsupported_event:
+ printk(KERN_ERR "FS-Cache:"
+ " Unsupported event %lx [mask %lx] in state %s\n",
+ object->events, object->event_mask,
+ fscache_object_states[object->state]);
+ BUG();
+}
+
+/*
+ * execute an object
+ */
+static void fscache_object_slow_work_execute(struct slow_work *work)
+{
+ struct fscache_object *object =
+ container_of(work, struct fscache_object, work);
+ unsigned long start;
+
+ _enter("{OBJ%x}", object->debug_id);
+
+ clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+
+ start = jiffies;
+ fscache_object_state_machine(object);
+ fscache_hist(fscache_objs_histogram, start);
+ if (object->events & object->event_mask)
+ fscache_enqueue_object(object);
+}
+
+/*
+ * initialise an object
+ * - check the specified object's parent to see if we can make use of it
+ * immediately to do a creation
+ * - we may need to start the process of creating a parent and we need to wait
+ * for the parent's lookup and creation to complete if it's not there yet
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ * leaf-most cookies of the object and all its children
+ */
+static void fscache_initialise_object(struct fscache_object *object)
+{
+ struct fscache_object *parent;
+
+ _enter("");
+ ASSERT(object->cookie != NULL);
+ ASSERT(object->cookie->parent != NULL);
+ ASSERT(list_empty(&object->work.link));
+
+ if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
+ (1 << FSCACHE_OBJECT_EV_RELEASE) |
+ (1 << FSCACHE_OBJECT_EV_RETIRE) |
+ (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
+ _debug("abort init %lx", object->events);
+ spin_lock(&object->lock);
+ object->state = FSCACHE_OBJECT_ABORT_INIT;
+ spin_unlock(&object->lock);
+ return;
+ }
+
+ spin_lock(&object->cookie->lock);
+ spin_lock_nested(&object->cookie->parent->lock, 1);
+
+ parent = object->parent;
+ if (!parent) {
+ _debug("no parent");
+ set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+ } else {
+ spin_lock(&object->lock);
+ spin_lock_nested(&parent->lock, 1);
+ _debug("parent %s", fscache_object_states[parent->state]);
+
+ if (parent->state >= FSCACHE_OBJECT_DYING) {
+ _debug("bad parent");
+ set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+ } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
+ _debug("wait");
+
+ /* we may get woken up in this state by child objects
+ * binding on to us, so we need to make sure we don't
+ * add ourself to the list multiple times */
+ if (list_empty(&object->dep_link)) {
+ object->cache->ops->grab_object(object);
+ list_add(&object->dep_link,
+ &parent->dependents);
+
+ /* fscache_acquire_non_index_cookie() uses this
+ * to wake the chain up */
+ if (parent->state == FSCACHE_OBJECT_INIT)
+ fscache_enqueue_object(parent);
+ }
+ } else {
+ _debug("go");
+ parent->n_ops++;
+ parent->n_obj_ops++;
+ object->lookup_jif = jiffies;
+ object->state = FSCACHE_OBJECT_LOOKING_UP;
+ set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ }
+
+ spin_unlock(&parent->lock);
+ spin_unlock(&object->lock);
+ }
+
+ spin_unlock(&object->cookie->parent->lock);
+ spin_unlock(&object->cookie->lock);
+ _leave("");
+}
+
+/*
+ * look an object up in the cache from which it was allocated
+ * - we hold an "access lock" on the parent object, so the parent object cannot
+ * be withdrawn by either party till we've finished
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ * leaf-most cookies of the object and all its children
+ */
+static void fscache_lookup_object(struct fscache_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_object *parent;
+
+ _enter("");
+
+ parent = object->parent;
+ ASSERT(parent != NULL);
+ ASSERTCMP(parent->n_ops, >, 0);
+ ASSERTCMP(parent->n_obj_ops, >, 0);
+
+ /* make sure the parent is still available */
+ ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+
+ if (parent->state >= FSCACHE_OBJECT_DYING ||
+ test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+ _debug("unavailable");
+ set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+ _leave("");
+ return;
+ }
+
+ _debug("LOOKUP \"%s/%s\" in \"%s\"",
+ parent->cookie->def->name, cookie->def->name,
+ object->cache->tag->name);
+
+ fscache_stat(&fscache_n_object_lookups);
+ object->cache->ops->lookup_object(object);
+
+ if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+ _leave("");
+}
+
+/**
+ * fscache_object_lookup_negative - Note negative cookie lookup
+ * @object: Object pointing to cookie to mark
+ *
+ * Note negative lookup, permitting those waiting to read data from an already
+ * existing backing object to continue as there's no data for them to read.
+ */
+void fscache_object_lookup_negative(struct fscache_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+
+ _enter("{OBJ%x,%s}",
+ object->debug_id, fscache_object_states[object->state]);
+
+ spin_lock(&object->lock);
+ if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ fscache_stat(&fscache_n_object_lookups_negative);
+
+ /* transit here to allow write requests to begin stacking up
+ * and read requests to begin returning ENODATA */
+ object->state = FSCACHE_OBJECT_CREATING;
+ spin_unlock(&object->lock);
+
+ set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+ _debug("wake up lookup %p", &cookie->flags);
+ smp_mb__before_clear_bit();
+ clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+ set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ } else {
+ ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+ spin_unlock(&object->lock);
+ }
+
+ _leave("");
+}
+EXPORT_SYMBOL(fscache_object_lookup_negative);
+
+/**
+ * fscache_obtained_object - Note successful object lookup or creation
+ * @object: Object pointing to cookie to mark
+ *
+ * Note successful lookup and/or creation, permitting those waiting to write
+ * data to a backing object to continue.
+ *
+ * Note that after calling this, an object's cookie may be relinquished by the
+ * netfs, and so must be accessed with object lock held.
+ */
+void fscache_obtained_object(struct fscache_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+
+ _enter("{OBJ%x,%s}",
+ object->debug_id, fscache_object_states[object->state]);
+
+ /* if we were still looking up, then we must have a positive lookup
+ * result, in which case there may be data available */
+ spin_lock(&object->lock);
+ if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+ fscache_stat(&fscache_n_object_lookups_positive);
+
+ clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+ object->state = FSCACHE_OBJECT_AVAILABLE;
+ spin_unlock(&object->lock);
+
+ smp_mb__before_clear_bit();
+ clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+ set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ } else {
+ ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+ fscache_stat(&fscache_n_object_created);
+
+ object->state = FSCACHE_OBJECT_AVAILABLE;
+ spin_unlock(&object->lock);
+ set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+ smp_wmb();
+ }
+
+ if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
+
+ _leave("");
+}
+EXPORT_SYMBOL(fscache_obtained_object);
+
+/*
+ * handle an object that has just become available
+ */
+static void fscache_object_available(struct fscache_object *object)
+{
+ _enter("{OBJ%x}", object->debug_id);
+
+ spin_lock(&object->lock);
+
+ if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+ wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+
+ fscache_done_parent_op(object);
+ if (object->n_in_progress == 0) {
+ if (object->n_ops > 0) {
+ ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
+ ASSERTIF(object->n_ops > object->n_obj_ops,
+ !list_empty(&object->pending_ops));
+ fscache_start_operations(object);
+ } else {
+ ASSERT(list_empty(&object->pending_ops));
+ }
+ }
+ spin_unlock(&object->lock);
+
+ object->cache->ops->lookup_complete(object);
+ fscache_enqueue_dependents(object);
+
+ fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
+ fscache_stat(&fscache_n_object_avail);
+
+ _leave("");
+}
+
+/*
+ * drop an object's attachments
+ */
+static void fscache_drop_object(struct fscache_object *object)
+{
+ struct fscache_object *parent = object->parent;
+ struct fscache_cache *cache = object->cache;
+
+ _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+
+ spin_lock(&cache->object_list_lock);
+ list_del_init(&object->cache_link);
+ spin_unlock(&cache->object_list_lock);
+
+ cache->ops->drop_object(object);
+
+ if (parent) {
+ _debug("release parent OBJ%x {%d}",
+ parent->debug_id, parent->n_children);
+
+ spin_lock(&parent->lock);
+ parent->n_children--;
+ if (parent->n_children == 0)
+ fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+ spin_unlock(&parent->lock);
+ object->parent = NULL;
+ }
+
+ /* this just shifts the object release to the slow work processor */
+ object->cache->ops->put_object(object);
+
+ _leave("");
+}
+
+/*
+ * release or recycle an object that the netfs has discarded
+ */
+static void fscache_release_object(struct fscache_object *object)
+{
+ _enter("");
+
+ fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service
+ */
+static void fscache_withdraw_object(struct fscache_object *object)
+{
+ struct fscache_cookie *cookie;
+ bool detached;
+
+ _enter("");
+
+ spin_lock(&object->lock);
+ cookie = object->cookie;
+ if (cookie) {
+ /* need to get the cookie lock before the object lock, starting
+ * from the object pointer */
+ atomic_inc(&cookie->usage);
+ spin_unlock(&object->lock);
+
+ detached = false;
+ spin_lock(&cookie->lock);
+ spin_lock(&object->lock);
+
+ if (object->cookie == cookie) {
+ hlist_del_init(&object->cookie_link);
+ object->cookie = NULL;
+ detached = true;
+ }
+ spin_unlock(&cookie->lock);
+ fscache_cookie_put(cookie);
+ if (detached)
+ fscache_cookie_put(cookie);
+ }
+
+ spin_unlock(&object->lock);
+
+ fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service at the behest of the cache
+ * - need break the links to a cached object cookie
+ * - called under two situations:
+ * (1) recycler decides to reclaim an in-use object
+ * (2) a cache is unmounted
+ * - have to take care as the cookie can be being relinquished by the netfs
+ * simultaneously
+ * - the object is pinned by the caller holding a refcount on it
+ */
+void fscache_withdrawing_object(struct fscache_cache *cache,
+ struct fscache_object *object)
+{
+ bool enqueue = false;
+
+ _enter(",OBJ%x", object->debug_id);
+
+ spin_lock(&object->lock);
+ if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
+ object->state = FSCACHE_OBJECT_WITHDRAWING;
+ enqueue = true;
+ }
+ spin_unlock(&object->lock);
+
+ if (enqueue)
+ fscache_enqueue_object(object);
+
+ _leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an object
+ */
+static int fscache_object_slow_work_get_ref(struct slow_work *work)
+{
+ struct fscache_object *object =
+ container_of(work, struct fscache_object, work);
+
+ return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on a work item
+ */
+static void fscache_object_slow_work_put_ref(struct slow_work *work)
+{
+ struct fscache_object *object =
+ container_of(work, struct fscache_object, work);
+
+ return object->cache->ops->put_object(object);
+}
+
+/*
+ * enqueue an object for metadata-type processing
+ */
+void fscache_enqueue_object(struct fscache_object *object)
+{
+ _enter("{OBJ%x}", object->debug_id);
+
+ slow_work_enqueue(&object->work);
+}
+
+/*
+ * enqueue the dependents of an object for metadata-type processing
+ * - the caller must hold the object's lock
+ * - this may cause an already locked object to wind up being processed again
+ */
+static void fscache_enqueue_dependents(struct fscache_object *object)
+{
+ struct fscache_object *dep;
+
+ _enter("{OBJ%x}", object->debug_id);
+
+ if (list_empty(&object->dependents))
+ return;
+
+ spin_lock(&object->lock);
+
+ while (!list_empty(&object->dependents)) {
+ dep = list_entry(object->dependents.next,
+ struct fscache_object, dep_link);
+ list_del_init(&dep->dep_link);
+
+
+ /* sort onto appropriate lists */
+ fscache_enqueue_object(dep);
+ dep->cache->ops->put_object(dep);
+
+ if (!list_empty(&object->dependents))
+ cond_resched_lock(&object->lock);
+ }
+
+ spin_unlock(&object->lock);
+}
+
+/*
+ * remove an object from whatever queue it's waiting on
+ * - the caller must hold object->lock
+ */
+void fscache_dequeue_object(struct fscache_object *object)
+{
+ _enter("{OBJ%x}", object->debug_id);
+
+ if (!list_empty(&object->dep_link)) {
+ spin_lock(&object->parent->lock);
+ list_del_init(&object->dep_link);
+ spin_unlock(&object->parent->lock);
+ }
+
+ _leave("");
+}
+
+/**
+ * fscache_check_aux - Ask the netfs whether an object on disk is still valid
+ * @object: The object to ask about
+ * @data: The auxiliary data for the object
+ * @datalen: The size of the auxiliary data
+ *
+ * This function consults the netfs about the coherency state of an object
+ */
+enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+ const void *data, uint16_t datalen)
+{
+ enum fscache_checkaux result;
+
+ if (!object->cookie->def->check_aux) {
+ fscache_stat(&fscache_n_checkaux_none);
+ return FSCACHE_CHECKAUX_OKAY;
+ }
+
+ result = object->cookie->def->check_aux(object->cookie->netfs_data,
+ data, datalen);
+ switch (result) {
+ /* entry okay as is */
+ case FSCACHE_CHECKAUX_OKAY:
+ fscache_stat(&fscache_n_checkaux_okay);
+ break;
+
+ /* entry requires update */
+ case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+ fscache_stat(&fscache_n_checkaux_update);
+ break;
+
+ /* entry requires deletion */
+ case FSCACHE_CHECKAUX_OBSOLETE:
+ fscache_stat(&fscache_n_checkaux_obsolete);
+ break;
+
+ default:
+ BUG();
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(fscache_check_aux);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 00000000000..e7f8d53b8b6
--- /dev/null
+++ b/fs/fscache/operation.c
@@ -0,0 +1,459 @@
+/* FS-Cache worker operation management routines
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/operations.txt
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include "internal.h"
+
+atomic_t fscache_op_debug_id;
+EXPORT_SYMBOL(fscache_op_debug_id);
+
+/**
+ * fscache_enqueue_operation - Enqueue an operation for processing
+ * @op: The operation to enqueue
+ *
+ * Enqueue an operation for processing by the FS-Cache thread pool.
+ *
+ * This will get its own ref on the object.
+ */
+void fscache_enqueue_operation(struct fscache_operation *op)
+{
+ _enter("{OBJ%x OP%x,%u}",
+ op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+ ASSERT(op->processor != NULL);
+ ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+ if (list_empty(&op->pend_link)) {
+ switch (op->flags & FSCACHE_OP_TYPE) {
+ case FSCACHE_OP_FAST:
+ _debug("queue fast");
+ atomic_inc(&op->usage);
+ if (!schedule_work(&op->fast_work))
+ fscache_put_operation(op);
+ break;
+ case FSCACHE_OP_SLOW:
+ _debug("queue slow");
+ slow_work_enqueue(&op->slow_work);
+ break;
+ case FSCACHE_OP_MYTHREAD:
+ _debug("queue for caller's attention");
+ break;
+ default:
+ printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
+ op->flags);
+ BUG();
+ break;
+ }
+ fscache_stat(&fscache_n_op_enqueue);
+ }
+}
+EXPORT_SYMBOL(fscache_enqueue_operation);
+
+/*
+ * start an op running
+ */
+static void fscache_run_op(struct fscache_object *object,
+ struct fscache_operation *op)
+{
+ object->n_in_progress++;
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ if (op->processor)
+ fscache_enqueue_operation(op);
+ fscache_stat(&fscache_n_op_run);
+}
+
+/*
+ * submit an exclusive operation for an object
+ * - other ops are excluded from running simultaneously with this one
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_exclusive_op(struct fscache_object *object,
+ struct fscache_operation *op)
+{
+ int ret;
+
+ _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+
+ spin_lock(&object->lock);
+ ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+ ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+ ret = -ENOBUFS;
+ if (fscache_object_is_active(object)) {
+ op->object = object;
+ object->n_ops++;
+ object->n_exclusive++; /* reads and writes must wait */
+
+ if (object->n_ops > 0) {
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ } else if (!list_empty(&object->pending_ops)) {
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ fscache_start_operations(object);
+ } else {
+ ASSERTCMP(object->n_in_progress, ==, 0);
+ fscache_run_op(object, op);
+ }
+
+ /* need to issue a new write op after this */
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+ ret = 0;
+ } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ op->object = object;
+ object->n_ops++;
+ object->n_exclusive++; /* reads and writes must wait */
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ ret = 0;
+ } else {
+ /* not allowed to submit ops in any other state */
+ BUG();
+ }
+
+ spin_unlock(&object->lock);
+ return ret;
+}
+
+/*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+ struct fscache_operation *op,
+ unsigned long ostate)
+{
+ static bool once_only;
+ struct fscache_operation *p;
+ unsigned n;
+
+ if (once_only)
+ return;
+ once_only = true;
+
+ kdebug("unexpected submission OP%x [OBJ%x %s]",
+ op->debug_id, object->debug_id,
+ fscache_object_states[object->state]);
+ kdebug("objstate=%s [%s]",
+ fscache_object_states[object->state],
+ fscache_object_states[ostate]);
+ kdebug("objflags=%lx", object->flags);
+ kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+ kdebug("ops=%u inp=%u exc=%u",
+ object->n_ops, object->n_in_progress, object->n_exclusive);
+
+ if (!list_empty(&object->pending_ops)) {
+ n = 0;
+ list_for_each_entry(p, &object->pending_ops, pend_link) {
+ ASSERTCMP(p->object, ==, object);
+ kdebug("%p %p", op->processor, op->release);
+ n++;
+ }
+
+ kdebug("n=%u", n);
+ }
+
+ dump_stack();
+}
+
+/*
+ * submit an operation for an object
+ * - objects may be submitted only in the following states:
+ * - during object creation (write ops may be submitted)
+ * - whilst the object is active
+ * - after an I/O error incurred in one of the two above states (op rejected)
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_op(struct fscache_object *object,
+ struct fscache_operation *op)
+{
+ unsigned long ostate;
+ int ret;
+
+ _enter("{OBJ%x OP%x},{%u}",
+ object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+ spin_lock(&object->lock);
+ ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+ ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+ ostate = object->state;
+ smp_rmb();
+
+ if (fscache_object_is_active(object)) {
+ op->object = object;
+ object->n_ops++;
+
+ if (object->n_exclusive > 0) {
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ } else if (!list_empty(&object->pending_ops)) {
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ fscache_start_operations(object);
+ } else {
+ ASSERTCMP(object->n_exclusive, ==, 0);
+ fscache_run_op(object, op);
+ }
+ ret = 0;
+ } else if (object->state == FSCACHE_OBJECT_CREATING) {
+ op->object = object;
+ object->n_ops++;
+ atomic_inc(&op->usage);
+ list_add_tail(&op->pend_link, &object->pending_ops);
+ fscache_stat(&fscache_n_op_pend);
+ ret = 0;
+ } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+ fscache_report_unexpected_submission(object, op, ostate);
+ ASSERT(!fscache_object_is_active(object));
+ ret = -ENOBUFS;
+ } else {
+ ret = -ENOBUFS;
+ }
+
+ spin_unlock(&object->lock);
+ return ret;
+}
+
+/*
+ * queue an object for withdrawal on error, aborting all following asynchronous
+ * operations
+ */
+void fscache_abort_object(struct fscache_object *object)
+{
+ _enter("{OBJ%x}", object->debug_id);
+
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+}
+
+/*
+ * jump start the operation processing on an object
+ * - caller must hold object->lock
+ */
+void fscache_start_operations(struct fscache_object *object)
+{
+ struct fscache_operation *op;
+ bool stop = false;
+
+ while (!list_empty(&object->pending_ops) && !stop) {
+ op = list_entry(object->pending_ops.next,
+ struct fscache_operation, pend_link);
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+ if (object->n_in_progress > 0)
+ break;
+ stop = true;
+ }
+ list_del_init(&op->pend_link);
+ object->n_in_progress++;
+
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ if (op->processor)
+ fscache_enqueue_operation(op);
+
+ /* the pending queue was holding a ref on the object */
+ fscache_put_operation(op);
+ }
+
+ ASSERTCMP(object->n_in_progress, <=, object->n_ops);
+
+ _debug("woke %d ops on OBJ%x",
+ object->n_in_progress, object->debug_id);
+}
+
+/*
+ * release an operation
+ * - queues pending ops if this is the last in-progress op
+ */
+void fscache_put_operation(struct fscache_operation *op)
+{
+ struct fscache_object *object;
+ struct fscache_cache *cache;
+
+ _enter("{OBJ%x OP%x,%d}",
+ op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+ if (!atomic_dec_and_test(&op->usage))
+ return;
+
+ _debug("PUT OP");
+ if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
+ BUG();
+
+ fscache_stat(&fscache_n_op_release);
+
+ if (op->release) {
+ op->release(op);
+ op->release = NULL;
+ }
+
+ object = op->object;
+
+ /* now... we may get called with the object spinlock held, so we
+ * complete the cleanup here only if we can immediately acquire the
+ * lock, and defer it otherwise */
+ if (!spin_trylock(&object->lock)) {
+ _debug("defer put");
+ fscache_stat(&fscache_n_op_deferred_release);
+
+ cache = object->cache;
+ spin_lock(&cache->op_gc_list_lock);
+ list_add_tail(&op->pend_link, &cache->op_gc_list);
+ spin_unlock(&cache->op_gc_list_lock);
+ schedule_work(&cache->op_gc);
+ _leave(" [defer]");
+ return;
+ }
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+ ASSERTCMP(object->n_exclusive, >, 0);
+ object->n_exclusive--;
+ }
+
+ ASSERTCMP(object->n_in_progress, >, 0);
+ object->n_in_progress--;
+ if (object->n_in_progress == 0)
+ fscache_start_operations(object);
+
+ ASSERTCMP(object->n_ops, >, 0);
+ object->n_ops--;
+ if (object->n_ops == 0)
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+ spin_unlock(&object->lock);
+
+ kfree(op);
+ _leave(" [done]");
+}
+EXPORT_SYMBOL(fscache_put_operation);
+
+/*
+ * garbage collect operations that have had their release deferred
+ */
+void fscache_operation_gc(struct work_struct *work)
+{
+ struct fscache_operation *op;
+ struct fscache_object *object;
+ struct fscache_cache *cache =
+ container_of(work, struct fscache_cache, op_gc);
+ int count = 0;
+
+ _enter("");
+
+ do {
+ spin_lock(&cache->op_gc_list_lock);
+ if (list_empty(&cache->op_gc_list)) {
+ spin_unlock(&cache->op_gc_list_lock);
+ break;
+ }
+
+ op = list_entry(cache->op_gc_list.next,
+ struct fscache_operation, pend_link);
+ list_del(&op->pend_link);
+ spin_unlock(&cache->op_gc_list_lock);
+
+ object = op->object;
+
+ _debug("GC DEFERRED REL OBJ%x OP%x",
+ object->debug_id, op->debug_id);
+ fscache_stat(&fscache_n_op_gc);
+
+ ASSERTCMP(atomic_read(&op->usage), ==, 0);
+
+ spin_lock(&object->lock);
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+ ASSERTCMP(object->n_exclusive, >, 0);
+ object->n_exclusive--;
+ }
+
+ ASSERTCMP(object->n_in_progress, >, 0);
+ object->n_in_progress--;
+ if (object->n_in_progress == 0)
+ fscache_start_operations(object);
+
+ ASSERTCMP(object->n_ops, >, 0);
+ object->n_ops--;
+ if (object->n_ops == 0)
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+ spin_unlock(&object->lock);
+
+ } while (count++ < 20);
+
+ if (!list_empty(&cache->op_gc_list))
+ schedule_work(&cache->op_gc);
+
+ _leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an operation
+ */
+static int fscache_op_get_ref(struct slow_work *work)
+{
+ struct fscache_operation *op =
+ container_of(work, struct fscache_operation, slow_work);
+
+ atomic_inc(&op->usage);
+ return 0;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on an operation
+ */
+static void fscache_op_put_ref(struct slow_work *work)
+{
+ struct fscache_operation *op =
+ container_of(work, struct fscache_operation, slow_work);
+
+ fscache_put_operation(op);
+}
+
+/*
+ * execute an operation using the slow thread pool to provide processing context
+ * - the caller holds a ref to this object, so we don't need to hold one
+ */
+static void fscache_op_execute(struct slow_work *work)
+{
+ struct fscache_operation *op =
+ container_of(work, struct fscache_operation, slow_work);
+ unsigned long start;
+
+ _enter("{OBJ%x OP%x,%d}",
+ op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+ ASSERT(op->processor != NULL);
+ start = jiffies;
+ op->processor(op);
+ fscache_hist(fscache_ops_histogram, start);
+
+ _leave("");
+}
+
+const struct slow_work_ops fscache_op_slow_work_ops = {
+ .get_ref = fscache_op_get_ref,
+ .put_ref = fscache_op_put_ref,
+ .execute = fscache_op_execute,
+};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 00000000000..2568e0eb644
--- /dev/null
+++ b/fs/fscache/page.c
@@ -0,0 +1,816 @@
+/* Cache page management and data I/O routines
+ *
+ * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#include <linux/fscache-cache.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * check to see if a page is being written to the cache
+ */
+bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+ void *val;
+
+ rcu_read_lock();
+ val = radix_tree_lookup(&cookie->stores, page->index);
+ rcu_read_unlock();
+
+ return val != NULL;
+}
+EXPORT_SYMBOL(__fscache_check_page_write);
+
+/*
+ * wait for a page to finish being written to the cache
+ */
+void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+ wait_event(*wq, !__fscache_check_page_write(cookie, page));
+}
+EXPORT_SYMBOL(__fscache_wait_on_page_write);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+ struct page *xpage;
+
+ spin_lock(&cookie->lock);
+ xpage = radix_tree_delete(&cookie->stores, page->index);
+ spin_unlock(&cookie->lock);
+ ASSERT(xpage != NULL);
+
+ wake_up_bit(&cookie->flags, 0);
+}
+
+/*
+ * actually apply the changed attributes to a cache object
+ */
+static void fscache_attr_changed_op(struct fscache_operation *op)
+{
+ struct fscache_object *object = op->object;
+
+ _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
+
+ fscache_stat(&fscache_n_attr_changed_calls);
+
+ if (fscache_object_is_active(object) &&
+ object->cache->ops->attr_changed(object) < 0)
+ fscache_abort_object(object);
+
+ _leave("");
+}
+
+/*
+ * notification that the attributes on an object have changed
+ */
+int __fscache_attr_changed(struct fscache_cookie *cookie)
+{
+ struct fscache_operation *op;
+ struct fscache_object *object;
+
+ _enter("%p", cookie);
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+ fscache_stat(&fscache_n_attr_changed);
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op) {
+ fscache_stat(&fscache_n_attr_changed_nomem);
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ fscache_operation_init(op, NULL);
+ fscache_operation_init_slow(op, fscache_attr_changed_op);
+ op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ if (fscache_submit_exclusive_op(object, op) < 0)
+ goto nobufs;
+ spin_unlock(&cookie->lock);
+ fscache_stat(&fscache_n_attr_changed_ok);
+ fscache_put_operation(op);
+ _leave(" = 0");
+ return 0;
+
+nobufs:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+ fscache_stat(&fscache_n_attr_changed_nobufs);
+ _leave(" = %d", -ENOBUFS);
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_attr_changed);
+
+/*
+ * handle secondary execution given to a retrieval op on behalf of the
+ * cache
+ */
+static void fscache_retrieval_work(struct work_struct *work)
+{
+ struct fscache_retrieval *op =
+ container_of(work, struct fscache_retrieval, op.fast_work);
+ unsigned long start;
+
+ _enter("{OP%x}", op->op.debug_id);
+
+ start = jiffies;
+ op->op.processor(&op->op);
+ fscache_hist(fscache_ops_histogram, start);
+ fscache_put_operation(&op->op);
+}
+
+/*
+ * release a retrieval op reference
+ */
+static void fscache_release_retrieval_op(struct fscache_operation *_op)
+{
+ struct fscache_retrieval *op =
+ container_of(_op, struct fscache_retrieval, op);
+
+ _enter("{OP%x}", op->op.debug_id);
+
+ fscache_hist(fscache_retrieval_histogram, op->start_time);
+ if (op->context)
+ fscache_put_context(op->op.object->cookie, op->context);
+
+ _leave("");
+}
+
+/*
+ * allocate a retrieval op
+ */
+static struct fscache_retrieval *fscache_alloc_retrieval(
+ struct address_space *mapping,
+ fscache_rw_complete_t end_io_func,
+ void *context)
+{
+ struct fscache_retrieval *op;
+
+ /* allocate a retrieval operation and attempt to submit it */
+ op = kzalloc(sizeof(*op), GFP_NOIO);
+ if (!op) {
+ fscache_stat(&fscache_n_retrievals_nomem);
+ return NULL;
+ }
+
+ fscache_operation_init(&op->op, fscache_release_retrieval_op);
+ op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+ op->mapping = mapping;
+ op->end_io_func = end_io_func;
+ op->context = context;
+ op->start_time = jiffies;
+ INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
+ INIT_LIST_HEAD(&op->to_do);
+ return op;
+}
+
+/*
+ * wait for a deferred lookup to complete
+ */
+static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
+{
+ unsigned long jif;
+
+ _enter("");
+
+ if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
+ _leave(" = 0 [imm]");
+ return 0;
+ }
+
+ fscache_stat(&fscache_n_retrievals_wait);
+
+ jif = jiffies;
+ if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+ fscache_wait_bit_interruptible,
+ TASK_INTERRUPTIBLE) != 0) {
+ fscache_stat(&fscache_n_retrievals_intr);
+ _leave(" = -ERESTARTSYS");
+ return -ERESTARTSYS;
+ }
+
+ ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
+
+ smp_rmb();
+ fscache_hist(fscache_retrieval_delay_histogram, jif);
+ _leave(" = 0 [dly]");
+ return 0;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - we return:
+ * -ENOMEM - out of memory, nothing done
+ * -ERESTARTSYS - interrupted
+ * -ENOBUFS - no backing object available in which to cache the block
+ * -ENODATA - no data available in the backing object for this block
+ * 0 - dispatched a read - it'll call end_io_func() when finished
+ */
+int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+ struct page *page,
+ fscache_rw_complete_t end_io_func,
+ void *context,
+ gfp_t gfp)
+{
+ struct fscache_retrieval *op;
+ struct fscache_object *object;
+ int ret;
+
+ _enter("%p,%p,,,", cookie, page);
+
+ fscache_stat(&fscache_n_retrievals);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ ASSERTCMP(page, !=, NULL);
+
+ if (fscache_wait_for_deferred_lookup(cookie) < 0)
+ return -ERESTARTSYS;
+
+ op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+ if (!op) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs_unlock;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+
+ if (fscache_submit_op(object, &op->op) < 0)
+ goto nobufs_unlock;
+ spin_unlock(&cookie->lock);
+
+ fscache_stat(&fscache_n_retrieval_ops);
+
+ /* pin the netfs read context in case we need to do the actual netfs
+ * read because we've encountered a cache read failure */
+ fscache_get_context(object->cookie, op->context);
+
+ /* we wait for the operation to become active, and then process it
+ * *here*, in this thread, and not in the thread pool */
+ if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+ _debug(">>> WT");
+ fscache_stat(&fscache_n_retrieval_op_waits);
+ wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ _debug("<<< GO");
+ }
+
+ /* ask the cache to honour the operation */
+ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+ ret = object->cache->ops->allocate_page(op, page, gfp);
+ if (ret == 0)
+ ret = -ENODATA;
+ } else {
+ ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+ }
+
+ if (ret == -ENOMEM)
+ fscache_stat(&fscache_n_retrievals_nomem);
+ else if (ret == -ERESTARTSYS)
+ fscache_stat(&fscache_n_retrievals_intr);
+ else if (ret == -ENODATA)
+ fscache_stat(&fscache_n_retrievals_nodata);
+ else if (ret < 0)
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ else
+ fscache_stat(&fscache_n_retrievals_ok);
+
+ fscache_put_retrieval(op);
+ _leave(" = %d", ret);
+ return ret;
+
+nobufs_unlock:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+nobufs:
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_page);
+
+/*
+ * read a list of page from the cache or allocate a block in which to store
+ * them
+ * - we return:
+ * -ENOMEM - out of memory, some pages may be being read
+ * -ERESTARTSYS - interrupted, some pages may be being read
+ * -ENOBUFS - no backing object or space available in which to cache any
+ * pages not being read
+ * -ENODATA - no data available in the backing object for some or all of
+ * the pages
+ * 0 - dispatched a read on all pages
+ *
+ * end_io_func() will be called for each page read from the cache as it is
+ * finishes being read
+ *
+ * any pages for which a read is dispatched will be removed from pages and
+ * nr_pages
+ */
+int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages,
+ fscache_rw_complete_t end_io_func,
+ void *context,
+ gfp_t gfp)
+{
+ fscache_pages_retrieval_func_t func;
+ struct fscache_retrieval *op;
+ struct fscache_object *object;
+ int ret;
+
+ _enter("%p,,%d,,,", cookie, *nr_pages);
+
+ fscache_stat(&fscache_n_retrievals);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ ASSERTCMP(*nr_pages, >, 0);
+ ASSERT(!list_empty(pages));
+
+ if (fscache_wait_for_deferred_lookup(cookie) < 0)
+ return -ERESTARTSYS;
+
+ op = fscache_alloc_retrieval(mapping, end_io_func, context);
+ if (!op)
+ return -ENOMEM;
+
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs_unlock;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ if (fscache_submit_op(object, &op->op) < 0)
+ goto nobufs_unlock;
+ spin_unlock(&cookie->lock);
+
+ fscache_stat(&fscache_n_retrieval_ops);
+
+ /* pin the netfs read context in case we need to do the actual netfs
+ * read because we've encountered a cache read failure */
+ fscache_get_context(object->cookie, op->context);
+
+ /* we wait for the operation to become active, and then process it
+ * *here*, in this thread, and not in the thread pool */
+ if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+ _debug(">>> WT");
+ fscache_stat(&fscache_n_retrieval_op_waits);
+ wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ _debug("<<< GO");
+ }
+
+ /* ask the cache to honour the operation */
+ if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
+ func = object->cache->ops->allocate_pages;
+ else
+ func = object->cache->ops->read_or_alloc_pages;
+ ret = func(op, pages, nr_pages, gfp);
+
+ if (ret == -ENOMEM)
+ fscache_stat(&fscache_n_retrievals_nomem);
+ else if (ret == -ERESTARTSYS)
+ fscache_stat(&fscache_n_retrievals_intr);
+ else if (ret == -ENODATA)
+ fscache_stat(&fscache_n_retrievals_nodata);
+ else if (ret < 0)
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ else
+ fscache_stat(&fscache_n_retrievals_ok);
+
+ fscache_put_retrieval(op);
+ _leave(" = %d", ret);
+ return ret;
+
+nobufs_unlock:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+nobufs:
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
+
+/*
+ * allocate a block in the cache on which to store a page
+ * - we return:
+ * -ENOMEM - out of memory, nothing done
+ * -ERESTARTSYS - interrupted
+ * -ENOBUFS - no backing object available in which to cache the block
+ * 0 - block allocated
+ */
+int __fscache_alloc_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp)
+{
+ struct fscache_retrieval *op;
+ struct fscache_object *object;
+ int ret;
+
+ _enter("%p,%p,,,", cookie, page);
+
+ fscache_stat(&fscache_n_allocs);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ ASSERTCMP(page, !=, NULL);
+
+ if (fscache_wait_for_deferred_lookup(cookie) < 0)
+ return -ERESTARTSYS;
+
+ op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+ if (!op)
+ return -ENOMEM;
+
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs_unlock;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ if (fscache_submit_op(object, &op->op) < 0)
+ goto nobufs_unlock;
+ spin_unlock(&cookie->lock);
+
+ fscache_stat(&fscache_n_alloc_ops);
+
+ if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+ _debug(">>> WT");
+ fscache_stat(&fscache_n_alloc_op_waits);
+ wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ _debug("<<< GO");
+ }
+
+ /* ask the cache to honour the operation */
+ ret = object->cache->ops->allocate_page(op, page, gfp);
+
+ if (ret < 0)
+ fscache_stat(&fscache_n_allocs_nobufs);
+ else
+ fscache_stat(&fscache_n_allocs_ok);
+
+ fscache_put_retrieval(op);
+ _leave(" = %d", ret);
+ return ret;
+
+nobufs_unlock:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+nobufs:
+ fscache_stat(&fscache_n_allocs_nobufs);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_alloc_page);
+
+/*
+ * release a write op reference
+ */
+static void fscache_release_write_op(struct fscache_operation *_op)
+{
+ _enter("{OP%x}", _op->debug_id);
+}
+
+/*
+ * perform the background storage of a page into the cache
+ */
+static void fscache_write_op(struct fscache_operation *_op)
+{
+ struct fscache_storage *op =
+ container_of(_op, struct fscache_storage, op);
+ struct fscache_object *object = op->op.object;
+ struct fscache_cookie *cookie = object->cookie;
+ struct page *page;
+ unsigned n;
+ void *results[1];
+ int ret;
+
+ _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+
+ spin_lock(&cookie->lock);
+ spin_lock(&object->lock);
+
+ if (!fscache_object_is_active(object)) {
+ spin_unlock(&object->lock);
+ spin_unlock(&cookie->lock);
+ _leave("");
+ return;
+ }
+
+ fscache_stat(&fscache_n_store_calls);
+
+ /* find a page to store */
+ page = NULL;
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
+ FSCACHE_COOKIE_PENDING_TAG);
+ if (n != 1)
+ goto superseded;
+ page = results[0];
+ _debug("gang %d [%lx]", n, page->index);
+ if (page->index > op->store_limit)
+ goto superseded;
+
+ radix_tree_tag_clear(&cookie->stores, page->index,
+ FSCACHE_COOKIE_PENDING_TAG);
+
+ spin_unlock(&object->lock);
+ spin_unlock(&cookie->lock);
+
+ if (page) {
+ ret = object->cache->ops->write_page(op, page);
+ fscache_end_page_write(cookie, page);
+ page_cache_release(page);
+ if (ret < 0)
+ fscache_abort_object(object);
+ else
+ fscache_enqueue_operation(&op->op);
+ }
+
+ _leave("");
+ return;
+
+superseded:
+ /* this writer is going away and there aren't any more things to
+ * write */
+ _debug("cease");
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+ spin_unlock(&object->lock);
+ spin_unlock(&cookie->lock);
+ _leave("");
+}
+
+/*
+ * request a page be stored in the cache
+ * - returns:
+ * -ENOMEM - out of memory, nothing done
+ * -ENOBUFS - no backing object available in which to cache the page
+ * 0 - dispatched a write - it'll call end_io_func() when finished
+ *
+ * if the cookie still has a backing object at this point, that object can be
+ * in one of a few states with respect to storage processing:
+ *
+ * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
+ * set)
+ *
+ * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
+ * fill op)
+ *
+ * (b) writes deferred till post-creation (mark page for writing and
+ * return immediately)
+ *
+ * (2) negative lookup, object created, initial fill being made from netfs
+ * (FSCACHE_COOKIE_INITIAL_FILL is set)
+ *
+ * (a) fill point not yet reached this page (mark page for writing and
+ * return)
+ *
+ * (b) fill point passed this page (queue op to store this page)
+ *
+ * (3) object extant (queue op to store this page)
+ *
+ * any other state is invalid
+ */
+int __fscache_write_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp)
+{
+ struct fscache_storage *op;
+ struct fscache_object *object;
+ int ret;
+
+ _enter("%p,%x,", cookie, (u32) page->flags);
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ ASSERT(PageFsCache(page));
+
+ fscache_stat(&fscache_n_stores);
+
+ op = kzalloc(sizeof(*op), GFP_NOIO);
+ if (!op)
+ goto nomem;
+
+ fscache_operation_init(&op->op, fscache_release_write_op);
+ fscache_operation_init_slow(&op->op, fscache_write_op);
+ op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+
+ ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+ if (ret < 0)
+ goto nomem_free;
+
+ ret = -ENOBUFS;
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+ if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+ goto nobufs;
+
+ /* add the page to the pending-storage radix tree on the backing
+ * object */
+ spin_lock(&object->lock);
+
+ _debug("store limit %llx", (unsigned long long) object->store_limit);
+
+ ret = radix_tree_insert(&cookie->stores, page->index, page);
+ if (ret < 0) {
+ if (ret == -EEXIST)
+ goto already_queued;
+ _debug("insert failed %d", ret);
+ goto nobufs_unlock_obj;
+ }
+
+ radix_tree_tag_set(&cookie->stores, page->index,
+ FSCACHE_COOKIE_PENDING_TAG);
+ page_cache_get(page);
+
+ /* we only want one writer at a time, but we do need to queue new
+ * writers after exclusive ops */
+ if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
+ goto already_pending;
+
+ spin_unlock(&object->lock);
+
+ op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
+ op->store_limit = object->store_limit;
+
+ if (fscache_submit_op(object, &op->op) < 0)
+ goto submit_failed;
+
+ spin_unlock(&cookie->lock);
+ radix_tree_preload_end();
+ fscache_stat(&fscache_n_store_ops);
+ fscache_stat(&fscache_n_stores_ok);
+
+ /* the slow work queue now carries its own ref on the object */
+ fscache_put_operation(&op->op);
+ _leave(" = 0");
+ return 0;
+
+already_queued:
+ fscache_stat(&fscache_n_stores_again);
+already_pending:
+ spin_unlock(&object->lock);
+ spin_unlock(&cookie->lock);
+ radix_tree_preload_end();
+ kfree(op);
+ fscache_stat(&fscache_n_stores_ok);
+ _leave(" = 0");
+ return 0;
+
+submit_failed:
+ radix_tree_delete(&cookie->stores, page->index);
+ page_cache_release(page);
+ ret = -ENOBUFS;
+ goto nobufs;
+
+nobufs_unlock_obj:
+ spin_unlock(&object->lock);
+nobufs:
+ spin_unlock(&cookie->lock);
+ radix_tree_preload_end();
+ kfree(op);
+ fscache_stat(&fscache_n_stores_nobufs);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+
+nomem_free:
+ kfree(op);
+nomem:
+ fscache_stat(&fscache_n_stores_oom);
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(__fscache_write_page);
+
+/*
+ * remove a page from the cache
+ */
+void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
+{
+ struct fscache_object *object;
+
+ _enter(",%p", page);
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+ ASSERTCMP(page, !=, NULL);
+
+ fscache_stat(&fscache_n_uncaches);
+
+ /* cache withdrawal may beat us to it */
+ if (!PageFsCache(page))
+ goto done;
+
+ /* get the object */
+ spin_lock(&cookie->lock);
+
+ if (hlist_empty(&cookie->backing_objects)) {
+ ClearPageFsCache(page);
+ goto done_unlock;
+ }
+
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ /* there might now be stuff on disk we could read */
+ clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+ /* only invoke the cache backend if we managed to mark the page
+ * uncached here; this deals with synchronisation vs withdrawal */
+ if (TestClearPageFsCache(page) &&
+ object->cache->ops->uncache_page) {
+ /* the cache backend releases the cookie lock */
+ object->cache->ops->uncache_page(object, page);
+ goto done;
+ }
+
+done_unlock:
+ spin_unlock(&cookie->lock);
+done:
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_page);
+
+/**
+ * fscache_mark_pages_cached - Mark pages as being cached
+ * @op: The retrieval op pages are being marked for
+ * @pagevec: The pages to be marked
+ *
+ * Mark a bunch of netfs pages as being cached. After this is called,
+ * the netfs must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_pages_cached(struct fscache_retrieval *op,
+ struct pagevec *pagevec)
+{
+ struct fscache_cookie *cookie = op->op.object->cookie;
+ unsigned long loop;
+
+#ifdef CONFIG_FSCACHE_STATS
+ atomic_add(pagevec->nr, &fscache_n_marks);
+#endif
+
+ for (loop = 0; loop < pagevec->nr; loop++) {
+ struct page *page = pagevec->pages[loop];
+
+ _debug("- mark %p{%lx}", page, page->index);
+ if (TestSetPageFsCache(page)) {
+ static bool once_only;
+ if (!once_only) {
+ once_only = true;
+ printk(KERN_WARNING "FS-Cache:"
+ " Cookie type %s marked page %lx"
+ " multiple times\n",
+ cookie->def->name, page->index);
+ }
+ }
+ }
+
+ if (cookie->def->mark_pages_cached)
+ cookie->def->mark_pages_cached(cookie->netfs_data,
+ op->mapping, pagevec);
+ pagevec_reinit(pagevec);
+}
+EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 00000000000..beeab44bc31
--- /dev/null
+++ b/fs/fscache/proc.c
@@ -0,0 +1,68 @@
+/* FS-Cache statistics viewing interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * initialise the /proc/fs/fscache/ directory
+ */
+int __init fscache_proc_init(void)
+{
+ _enter("");
+
+ if (!proc_mkdir("fs/fscache", NULL))
+ goto error_dir;
+
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
+ &fscache_stats_fops))
+ goto error_stats;
+#endif
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+ if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
+ &fscache_histogram_fops))
+ goto error_histogram;
+#endif
+
+ _leave(" = 0");
+ return 0;
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+error_histogram:
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+ remove_proc_entry("fs/fscache/stats", NULL);
+error_stats:
+#endif
+ remove_proc_entry("fs/fscache", NULL);
+error_dir:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/fscache/ directory
+ */
+void fscache_proc_cleanup(void)
+{
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+ remove_proc_entry("fs/fscache/histogram", NULL);
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+ remove_proc_entry("fs/fscache/stats", NULL);
+#endif
+ remove_proc_entry("fs/fscache", NULL);
+}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 00000000000..65deb99e756
--- /dev/null
+++ b/fs/fscache/stats.c
@@ -0,0 +1,212 @@
+/* FS-Cache statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * operation counters
+ */
+atomic_t fscache_n_op_pend;
+atomic_t fscache_n_op_run;
+atomic_t fscache_n_op_enqueue;
+atomic_t fscache_n_op_requeue;
+atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_release;
+atomic_t fscache_n_op_gc;
+
+atomic_t fscache_n_attr_changed;
+atomic_t fscache_n_attr_changed_ok;
+atomic_t fscache_n_attr_changed_nobufs;
+atomic_t fscache_n_attr_changed_nomem;
+atomic_t fscache_n_attr_changed_calls;
+
+atomic_t fscache_n_allocs;
+atomic_t fscache_n_allocs_ok;
+atomic_t fscache_n_allocs_wait;
+atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_alloc_ops;
+atomic_t fscache_n_alloc_op_waits;
+
+atomic_t fscache_n_retrievals;
+atomic_t fscache_n_retrievals_ok;
+atomic_t fscache_n_retrievals_wait;
+atomic_t fscache_n_retrievals_nodata;
+atomic_t fscache_n_retrievals_nobufs;
+atomic_t fscache_n_retrievals_intr;
+atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrieval_ops;
+atomic_t fscache_n_retrieval_op_waits;
+
+atomic_t fscache_n_stores;
+atomic_t fscache_n_stores_ok;
+atomic_t fscache_n_stores_again;
+atomic_t fscache_n_stores_nobufs;
+atomic_t fscache_n_stores_oom;
+atomic_t fscache_n_store_ops;
+atomic_t fscache_n_store_calls;
+
+atomic_t fscache_n_marks;
+atomic_t fscache_n_uncaches;
+
+atomic_t fscache_n_acquires;
+atomic_t fscache_n_acquires_null;
+atomic_t fscache_n_acquires_no_cache;
+atomic_t fscache_n_acquires_ok;
+atomic_t fscache_n_acquires_nobufs;
+atomic_t fscache_n_acquires_oom;
+
+atomic_t fscache_n_updates;
+atomic_t fscache_n_updates_null;
+atomic_t fscache_n_updates_run;
+
+atomic_t fscache_n_relinquishes;
+atomic_t fscache_n_relinquishes_null;
+atomic_t fscache_n_relinquishes_waitcrt;
+
+atomic_t fscache_n_cookie_index;
+atomic_t fscache_n_cookie_data;
+atomic_t fscache_n_cookie_special;
+
+atomic_t fscache_n_object_alloc;
+atomic_t fscache_n_object_no_alloc;
+atomic_t fscache_n_object_lookups;
+atomic_t fscache_n_object_lookups_negative;
+atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_created;
+atomic_t fscache_n_object_avail;
+atomic_t fscache_n_object_dead;
+
+atomic_t fscache_n_checkaux_none;
+atomic_t fscache_n_checkaux_okay;
+atomic_t fscache_n_checkaux_update;
+atomic_t fscache_n_checkaux_obsolete;
+
+/*
+ * display the general statistics
+ */
+static int fscache_stats_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "FS-Cache statistics\n");
+
+ seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
+ atomic_read(&fscache_n_cookie_index),
+ atomic_read(&fscache_n_cookie_data),
+ atomic_read(&fscache_n_cookie_special));
+
+ seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
+ atomic_read(&fscache_n_object_alloc),
+ atomic_read(&fscache_n_object_no_alloc),
+ atomic_read(&fscache_n_object_avail),
+ atomic_read(&fscache_n_object_dead));
+ seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
+ atomic_read(&fscache_n_checkaux_none),
+ atomic_read(&fscache_n_checkaux_okay),
+ atomic_read(&fscache_n_checkaux_update),
+ atomic_read(&fscache_n_checkaux_obsolete));
+
+ seq_printf(m, "Pages : mrk=%u unc=%u\n",
+ atomic_read(&fscache_n_marks),
+ atomic_read(&fscache_n_uncaches));
+
+ seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
+ " oom=%u\n",
+ atomic_read(&fscache_n_acquires),
+ atomic_read(&fscache_n_acquires_null),
+ atomic_read(&fscache_n_acquires_no_cache),
+ atomic_read(&fscache_n_acquires_ok),
+ atomic_read(&fscache_n_acquires_nobufs),
+ atomic_read(&fscache_n_acquires_oom));
+
+ seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+ atomic_read(&fscache_n_object_lookups),
+ atomic_read(&fscache_n_object_lookups_negative),
+ atomic_read(&fscache_n_object_lookups_positive),
+ atomic_read(&fscache_n_object_created));
+
+ seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
+ atomic_read(&fscache_n_updates),
+ atomic_read(&fscache_n_updates_null),
+ atomic_read(&fscache_n_updates_run));
+
+ seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+ atomic_read(&fscache_n_relinquishes),
+ atomic_read(&fscache_n_relinquishes_null),
+ atomic_read(&fscache_n_relinquishes_waitcrt));
+
+ seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
+ atomic_read(&fscache_n_attr_changed),
+ atomic_read(&fscache_n_attr_changed_ok),
+ atomic_read(&fscache_n_attr_changed_nobufs),
+ atomic_read(&fscache_n_attr_changed_nomem),
+ atomic_read(&fscache_n_attr_changed_calls));
+
+ seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+ atomic_read(&fscache_n_allocs),
+ atomic_read(&fscache_n_allocs_ok),
+ atomic_read(&fscache_n_allocs_wait),
+ atomic_read(&fscache_n_allocs_nobufs));
+ seq_printf(m, "Allocs : ops=%u owt=%u\n",
+ atomic_read(&fscache_n_alloc_ops),
+ atomic_read(&fscache_n_alloc_op_waits));
+
+ seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
+ " int=%u oom=%u\n",
+ atomic_read(&fscache_n_retrievals),
+ atomic_read(&fscache_n_retrievals_ok),
+ atomic_read(&fscache_n_retrievals_wait),
+ atomic_read(&fscache_n_retrievals_nodata),
+ atomic_read(&fscache_n_retrievals_nobufs),
+ atomic_read(&fscache_n_retrievals_intr),
+ atomic_read(&fscache_n_retrievals_nomem));
+ seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+ atomic_read(&fscache_n_retrieval_ops),
+ atomic_read(&fscache_n_retrieval_op_waits));
+
+ seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
+ atomic_read(&fscache_n_stores),
+ atomic_read(&fscache_n_stores_ok),
+ atomic_read(&fscache_n_stores_again),
+ atomic_read(&fscache_n_stores_nobufs),
+ atomic_read(&fscache_n_stores_oom));
+ seq_printf(m, "Stores : ops=%u run=%u\n",
+ atomic_read(&fscache_n_store_ops),
+ atomic_read(&fscache_n_store_calls));
+
+ seq_printf(m, "Ops : pend=%u run=%u enq=%u\n",
+ atomic_read(&fscache_n_op_pend),
+ atomic_read(&fscache_n_op_run),
+ atomic_read(&fscache_n_op_enqueue));
+ seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
+ atomic_read(&fscache_n_op_deferred_release),
+ atomic_read(&fscache_n_op_release),
+ atomic_read(&fscache_n_op_gc));
+ return 0;
+}
+
+/*
+ * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
+ */
+static int fscache_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, fscache_stats_show, NULL);
+}
+
+const struct file_operations fscache_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = fscache_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719b..4e340fedf76 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
* - sync(2)
* - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
*/
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
/*
* Don't use page->mapping as it may become NULL from a
* concurrent truncate.
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e74..e0b53aa7bbe 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
mode_t mode = inode->i_mode;
int error;
- inode->i_mode = mode & ~current->fs->umask;
+ inode->i_mode = mode & ~current_umask();
if (!S_ISLNK(inode->i_mode))
acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa76..fa881bdc3d8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
if (error)
return error;
if (!acl) {
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
if (mode != ip->i_inode.i_mode)
error = munge_mode(ip, mode);
return error;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500..70b9b854894 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
* blocks allocated on disk to back that page.
*/
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,8 @@ out_unlock:
gfs2_glock_dq(&gh);
out:
gfs2_holder_uninit(&gh);
+ if (ret)
+ ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index c8b5acf4b0b..a36bb749926 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -82,6 +82,7 @@ static void hfs_put_super(struct super_block *sb)
static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = HFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -90,6 +91,8 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = HFS_SB(sb)->fs_ablocks;
buf->f_ffree = HFS_SB(sb)->free_ablocks;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = HFS_NAMELEN;
return 0;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdf..3fcbb0e1f6f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
opts->creator = HFSPLUS_DEF_CR_TYPE;
opts->type = HFSPLUS_DEF_CR_TYPE;
- opts->umask = current->fs->umask;
+ opts->umask = current_umask();
opts->uid = current_uid();
opts->gid = current_gid();
opts->part = -1;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index eb74531a0a8..f2a64020f42 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -223,6 +223,7 @@ static void hfsplus_put_super(struct super_block *sb)
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = HFSPLUS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -231,6 +232,8 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = 0xFFFFFFFF;
buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = HFSPLUS_MAX_STRLEN;
return 0;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c..fecf402d7b8 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -136,6 +136,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *s = dentry->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
+ u64 id = huge_encode_dev(s->s_bdev->bd_dev);
lock_kernel();
/*if (sbi->sb_n_free == -1) {*/
@@ -149,6 +150,8 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = sbi->sb_n_free;
buf->f_files = sbi->sb_dirband_size / 4;
buf->f_ffree = sbi->sb_n_free_dnodes;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = 254;
unlock_kernel();
@@ -477,7 +480,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
uid = current_uid();
gid = current_gid();
- umask = current->fs->umask;
+ umask = current_umask();
lowercase = 0;
conv = CONV_BINARY;
eas = 2;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b278f7f5202..a5089a6dd67 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -280,7 +280,12 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
"errno = %d\n", err);
return err;
}
- count = hppfs_read_file(hppfs->host_fd, buf, count);
+ err = hppfs_read_file(hppfs->host_fd, buf, count);
+ if (err < 0) {
+ printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
+ return err;
+ }
+ count = err;
if (count > 0)
*ppos += count;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a68..23a3c76711e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
static int can_do_hugetlb_shm(void)
{
- return likely(capable(CAP_IPC_LOCK) ||
- in_group_p(sysctl_hugetlb_shm_group) ||
- can_do_mlock());
+ return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
{
int error = -ENOMEM;
+ int unlock_shm = 0;
struct file *file;
struct inode *inode;
struct dentry *dentry, *root;
@@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
- if (!can_do_hugetlb_shm())
- return ERR_PTR(-EPERM);
-
- if (!user_shm_lock(size, user))
- return ERR_PTR(-ENOMEM);
+ if (!can_do_hugetlb_shm()) {
+ if (user_shm_lock(size, user)) {
+ unlock_shm = 1;
+ WARN_ONCE(1,
+ "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+ } else
+ return ERR_PTR(-EPERM);
+ }
root = hugetlbfs_vfsmount->mnt_root;
quick_string.name = name;
@@ -1004,7 +1006,8 @@ out_inode:
out_dentry:
dput(dentry);
out_shm_unlock:
- user_shm_unlock(size, user);
+ if (unlock_shm)
+ user_shm_unlock(size, user);
return ERR_PTR(error);
}
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f173..b4dac4fb6b6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
struct super_block;
struct linux_binprm;
+struct path;
/*
* block_dev.c
@@ -43,7 +44,7 @@ extern void __init chrdev_init(void);
/*
* exec.c
*/
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
/*
* namespace.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 13d2eddd069..b4cbe9603c7 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -923,6 +923,7 @@ out_freesbi:
static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = ISOFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -932,6 +933,8 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = 0;
buf->f_files = ISOFS_SB(sb)->s_ninodes;
buf->f_ffree = 0;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = NAME_MAX;
return 0;
}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e79c07812af..737f7246a4b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -637,6 +637,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
return NULL;
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ if (!bh)
+ return NULL;
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
set_buffer_uptodate(bh);
@@ -733,9 +735,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__func__);
- kfree(journal);
- journal = NULL;
- goto out;
+ goto out_err;
}
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
@@ -743,11 +743,19 @@ journal_t * journal_init_dev(struct block_device *bdev,
journal->j_maxlen = len;
bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- J_ASSERT(bh != NULL);
+ if (!bh) {
+ printk(KERN_ERR
+ "%s: Cannot get buffer for journal superblock\n",
+ __func__);
+ goto out_err;
+ }
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
return journal;
+out_err:
+ kfree(journal);
+ return NULL;
}
/**
@@ -787,8 +795,7 @@ journal_t * journal_init_inode (struct inode *inode)
if (!journal->j_wbuf) {
printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
__func__);
- kfree(journal);
- return NULL;
+ goto out_err;
}
err = journal_bmap(journal, 0, &blocknr);
@@ -796,16 +803,23 @@ journal_t * journal_init_inode (struct inode *inode)
if (err) {
printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
__func__);
- kfree(journal);
- return NULL;
+ goto out_err;
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- J_ASSERT(bh != NULL);
+ if (!bh) {
+ printk(KERN_ERR
+ "%s: Cannot get buffer for journal superblock\n",
+ __func__);
+ goto out_err;
+ }
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
return journal;
+out_err:
+ kfree(journal);
+ return NULL;
}
/*
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44..4ea72377c7a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
+ int write_op = WRITE;
/*
* First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ if (commit_transaction->t_synchronous_commit)
+ write_op = WRITE_SYNC;
stats.u.run.rs_wait = commit_transaction->t_max_wait;
stats.u.run.rs_locked = jiffies;
stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE, bh);
+ submit_bh(write_op, bh);
}
cond_resched();
stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff262576..bbe6d592d8b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table. Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
*/
#ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
- *
- * The caller must have the journal locked.
*/
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
*/
-
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction)
{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598..996ffda06bf 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
}
}
+ if (handle->h_sync)
+ transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1..77ccf8cb082 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
return PTR_ERR(acl);
if (!acl) {
- *i_mode &= ~current->fs->umask;
+ *i_mode &= ~current_umask();
} else {
if (S_ISDIR(*i_mode))
jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e8..06ca1b8d205 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
cleanup:
posix_acl_release(acl);
} else
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
inode->i_mode;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
return 0;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
- struct in6_addr *addr_mapped)
-{
- const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
-
- switch (sap->sa_family) {
- case AF_INET6:
- return &((const struct sockaddr_in6 *)sap)->sin6_addr;
- case AF_INET:
- ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
- return addr_mapped;
- }
-
- return NULL;
-}
-
-/*
- * If lockd is using a PF_INET6 listener, all incoming requests appear
- * to come from AF_INET6 remotes. The address of AF_INET remotes are
- * mapped to AF_INET6 automatically by the network layer. In case the
- * user passed an AF_INET server address at mount time, ensure both
- * addresses are AF_INET6 before comparing them.
- */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
- const struct sockaddr *sap)
-{
- const struct in6_addr *addr1;
- const struct in6_addr *addr2;
- struct in6_addr addr1_mapped;
- struct in6_addr addr2_mapped;
-
- addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
- if (likely(addr1 != NULL)) {
- addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
- if (likely(addr2 != NULL))
- return ipv6_addr_equal(addr1, addr2);
- }
-
- return 0;
-}
-#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
- const struct sockaddr *sap)
-{
- return nlm_cmp_addr(nlm_addr(host), sap);
-}
-#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-
/*
* The server lockd has called us back to tell us the lock was granted
*/
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlmclnt_cmp_addr(block->b_host, addr))
+ if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac82..6d5d4a4169e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
+#include <asm/unaligned.h>
+
#define NLMDBG_FACILITY NLMDBG_MONITOR
#define NSM_PROGRAM 100024
#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
{
u64 *p = (u64 *)&nsm->sm_priv.data;
struct timespec ts;
+ s64 ns;
ktime_get_ts(&ts);
- *p++ = timespec_to_ns(&ts);
- *p = (unsigned long)nsm;
+ ns = timespec_to_ns(&ts);
+ put_unaligned(ns, p);
+ put_unaligned((unsigned long)nsm, p + 1);
}
static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b585..abf83881f68 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
- defined(CONFIG_SUNRPC_REGISTER_V4)
-static const sa_family_t nlmsvc_family = AF_INET6;
-#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-static const sa_family_t nlmsvc_family = AF_INET;
-#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-
-/*
* These can be set at insmod time (useful for NFS as root filesystem),
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
*/
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
return 0;
}
-static int create_lockd_listener(struct svc_serv *serv, char *name,
- unsigned short port)
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
+ const int family, const unsigned short port)
{
struct svc_xprt *xprt;
- xprt = svc_find_xprt(serv, name, 0, 0);
+ xprt = svc_find_xprt(serv, name, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
-
+ return svc_create_xprt(serv, name, family, port,
+ SVC_SOCK_DEFAULTS);
svc_xprt_put(xprt);
return 0;
}
+static int create_lockd_family(struct svc_serv *serv, const int family)
+{
+ int err;
+
+ err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+ if (err < 0)
+ return err;
+
+ return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+}
+
/*
* Ensure there are active UDP and TCP listeners for lockd.
*
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
static int warned;
int err;
- err = create_lockd_listener(serv, "udp", nlm_udpport);
+ err = create_lockd_family(serv, PF_INET);
if (err < 0)
goto out_err;
- err = create_lockd_listener(serv, "tcp", nlm_tcpport);
- if (err < 0)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ err = create_lockd_family(serv, PF_INET6);
+ if (err < 0 && err != -EAFNOSUPPORT)
goto out_err;
+#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
warned = 0;
return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 618865b3128..daad3c2740d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -321,15 +321,20 @@ out:
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
- buf->f_type = dentry->d_sb->s_magic;
- buf->f_bsize = dentry->d_sb->s_blocksize;
+ struct super_block *sb = dentry->d_sb;
+ struct minix_sb_info *sbi = minix_sb(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = sb->s_blocksize;
buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
buf->f_bfree = minix_count_free_blocks(sbi);
buf->f_bavail = buf->f_bfree;
buf->f_files = sbi->s_ninodes;
buf->f_ffree = minix_count_free_inodes(sbi);
buf->f_namelen = sbi->s_namelen;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+
return 0;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 16c3ef37eae..680ba60863f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-struct bio *mpage_bio_submit(int rw, struct bio *bio)
+static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,7 +90,6 @@ struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
-EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -439,7 +438,14 @@ EXPORT_SYMBOL(mpage_readpage);
* just allocate full-size (16-page) BIOs.
*/
-int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+struct mpage_data {
+ struct bio *bio;
+ sector_t last_block_in_bio;
+ get_block_t *get_block;
+ unsigned use_writepage;
+};
+
+static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
struct mpage_data *mpd = data;
@@ -648,7 +654,6 @@ out:
mpd->bio = bio;
return ret;
}
-EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785..b8433ebfae0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
+#include <linux/fs_struct.h>
#include <asm/uaccess.h>
#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -1578,7 +1579,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
struct dentry *dir = nd->path.dentry;
if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
error = security_path_mknod(&nd->path, path->dentry, mode, 0);
if (error)
goto out_unlock;
@@ -1989,7 +1990,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
goto out_unlock;
}
if (!IS_POSIXACL(nd.path.dentry->d_inode))
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
error = may_mknod(mode);
if (error)
goto out_dput;
@@ -2067,7 +2068,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
goto out_unlock;
if (!IS_POSIXACL(nd.path.dentry->d_inode))
- mode &= ~current->fs->umask;
+ mode &= ~current_umask();
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
@@ -2897,10 +2898,3 @@ EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);
-
-/* to be mentioned only in INIT_TASK */
-struct fs_struct init_fs = {
- .count = ATOMIC_INIT(1),
- .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
- .umask = 0022,
-};
diff --git a/fs/namespace.c b/fs/namespace.c
index 0a42e0e9602..c6f54e4c429 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
#include <linux/ramfs.h>
#include <linux/log2.h>
#include <linux/idr.h>
+#include <linux/fs_struct.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -2093,66 +2094,6 @@ out1:
}
/*
- * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_root(struct fs_struct *fs, struct path *path)
-{
- struct path old_root;
-
- write_lock(&fs->lock);
- old_root = fs->root;
- fs->root = *path;
- path_get(path);
- write_unlock(&fs->lock);
- if (old_root.dentry)
- path_put(&old_root);
-}
-
-/*
- * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_pwd(struct fs_struct *fs, struct path *path)
-{
- struct path old_pwd;
-
- write_lock(&fs->lock);
- old_pwd = fs->pwd;
- fs->pwd = *path;
- path_get(path);
- write_unlock(&fs->lock);
-
- if (old_pwd.dentry)
- path_put(&old_pwd);
-}
-
-static void chroot_fs_refs(struct path *old_root, struct path *new_root)
-{
- struct task_struct *g, *p;
- struct fs_struct *fs;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- task_lock(p);
- fs = p->fs;
- if (fs) {
- atomic_inc(&fs->count);
- task_unlock(p);
- if (fs->root.dentry == old_root->dentry
- && fs->root.mnt == old_root->mnt)
- set_fs_root(fs, new_root);
- if (fs->pwd.dentry == old_root->dentry
- && fs->pwd.mnt == old_root->mnt)
- set_fs_pwd(fs, new_root);
- put_fs_struct(fs);
- } else
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
-}
-
-/*
* pivot_root Semantics:
* Moves the root file system of the current process to the directory put_old,
* makes new_root as the new root file system of the current process, and sets
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 36fe20d6eba..e67f3ec0773 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -84,3 +84,11 @@ config ROOT_NFS
<file:Documentation/filesystems/nfsroot.txt>.
Most people say N here.
+
+config NFS_FSCACHE
+ bool "Provide NFS client caching support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+ help
+ Say Y here if you want NFS data to be cached locally on disc through
+ the general filesystem cache manager
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ac6170c594a..845159814de 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a108..a886e692ddd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
static const int nfs_set_port_min = 0;
static const int nfs_set_port_max = 65535;
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t nfs_callback_family = AF_INET;
-#endif
-
static int param_set_port(const char *val, struct kernel_param *kp)
{
char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
- nfs_callback_family, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
- ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
- SVC_SOCK_ANONYMOUS);
+ ret = svc_create_xprt(serv, "tcp", PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret <= 0)
goto out_err;
nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u)\n",
- nfs_callback_tcpport, nfs_callback_family);
+ nfs_callback_tcpport, PF_INET);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ ret = svc_create_xprt(serv, "tcp", PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+ if (ret > 0) {
+ nfs_callback_tcpport6 = ret;
+ dprintk("NFS: Callback listener port = %u (af %u)\n",
+ nfs_callback_tcpport6, PF_INET6);
+ } else if (ret != -EAFNOSUPPORT)
+ goto out_err;
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff..e110e286a26 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
extern unsigned int nfs_callback_set_tcpport;
extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e..75c9cd2aa11 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -45,6 +45,7 @@
#include "delegation.h"
#include "iostat.h"
#include "internal.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_CLIENT
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
if (!IS_ERR(cred))
clp->cl_machine_cred = cred;
+ nfs_fscache_get_client_cookie(clp);
+
return clp;
error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
nfs4_shutdown_client(clp);
+ nfs_fscache_release_client_cookie(clp);
+
/* -EIO all pending I/O */
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
@@ -224,38 +229,6 @@ void nfs_put_client(struct nfs_client *clp)
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
- switch (sa->sa_family) {
- default:
- return NULL;
- case AF_INET6:
- return &((const struct sockaddr_in6 *)sa)->sin6_addr;
- break;
- case AF_INET:
- ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
- addr_mapped);
- return addr_mapped;
- }
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct in6_addr *addr1;
- const struct in6_addr *addr2;
- struct in6_addr addr1_mapped;
- struct in6_addr addr2_mapped;
-
- addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
- if (likely(addr1 != NULL)) {
- addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
- if (likely(addr2 != NULL))
- return ipv6_addr_equal(addr1, addr2);
- }
- return 0;
-}
-
/*
* Test if two ip6 socket addresses refer to the same socket by
* comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +240,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
*
* The caller should ensure both socket addresses are AF_INET6.
*/
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
{
- const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+ const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+ const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
- if (!ipv6_addr_equal(&saddr1->sin6_addr,
- &saddr1->sin6_addr))
- return 0;
- if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
- saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+ if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+ sin1->sin6_scope_id != sin2->sin6_scope_id)
return 0;
- return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
- const struct sockaddr_in *sa2)
-{
- return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
- return 0;
- return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
- (const struct sockaddr_in *)sa2);
+ return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
- const struct sockaddr * sa2)
+#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
{
return 0;
}
@@ -311,20 +267,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
*
* The caller should ensure both socket addresses are AF_INET.
*/
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+ const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+ return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+ const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+ return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+ (sin1->sin6_port == sin2->sin6_port);
+}
+
static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
const struct sockaddr *sa2)
{
- const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+ const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+ const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
- if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+ return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+ (sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ if (sa1->sa_family != sa2->sa_family)
return 0;
- return saddr1->sin_port == saddr2->sin_port;
+
+ switch (sa1->sa_family) {
+ case AF_INET:
+ return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+ case AF_INET6:
+ return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+ }
+ return 0;
}
/*
* Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
*/
static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
const struct sockaddr *sa2)
@@ -772,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
+ server->options = data->options;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1160,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->options = data->options;
/* Get a client record */
error = nfs4_set_client(server,
@@ -1571,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
/* display header on line 1 */
if (v == &nfs_volume_list) {
- seq_puts(m, "NV SERVER PORT DEV FSID\n");
+ seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
return 0;
}
/* display one transport per line on subsequent lines */
@@ -1585,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
- seq_printf(m, "v%u %s %s %-7s %-17s\n",
+ seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
dev,
- fsid);
+ fsid,
+ nfs_server_fscache_state(server));
return 0;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db..370b190a09d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
} else if (atomic_read(&new_dentry->d_count) > 1)
/* dentry still busy? */
goto out;
- } else
- nfs_drop_nlink(new_inode);
+ }
go_ahead:
/*
@@ -1638,10 +1637,8 @@ go_ahead:
}
nfs_inode_return_delegation(old_inode);
- if (new_inode != NULL) {
+ if (new_inode != NULL)
nfs_inode_return_delegation(new_inode);
- d_delete(new_dentry);
- }
error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
if (rehash)
d_rehash(rehash);
if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
d_move(old_dentry, new_dentry);
nfs_set_verifier(new_dentry,
nfs_save_change_attribute(new_dir));
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d..3523b895eb4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -35,6 +35,7 @@
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_FILE
@@ -64,11 +65,7 @@ const struct file_operations nfs_file_operations = {
.write = do_sync_write,
.aio_read = nfs_file_read,
.aio_write = nfs_file_write,
-#ifdef CONFIG_MMU
.mmap = nfs_file_mmap,
-#else
- .mmap = generic_file_mmap,
-#endif
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
@@ -141,9 +138,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
dentry->d_parent->d_name.name,
dentry->d_name.name);
- /* Ensure that dirty pages are flushed out with the right creds */
- if (filp->f_mode & FMODE_WRITE)
- nfs_wb_all(dentry->d_inode);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
return nfs_release(inode, filp);
}
@@ -235,7 +229,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
- int status;
dprintk("NFS: flush(%s/%s)\n",
dentry->d_parent->d_name.name,
@@ -245,11 +238,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
return 0;
nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
- /* Ensure that data+attribute caches are up to date after close() */
- status = nfs_do_fsync(ctx, inode);
- if (!status)
- nfs_revalidate_inode(NFS_SERVER(inode), inode);
- return status;
+ /* Flush writes to the server and return any errors */
+ return nfs_do_fsync(ctx, inode);
}
static ssize_t
@@ -304,11 +294,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
dprintk("NFS: mmap(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
- status = nfs_revalidate_mapping(inode, file->f_mapping);
+ /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ * so we call that before revalidating the mapping
+ */
+ status = generic_file_mmap(file, vma);
if (!status) {
vma->vm_ops = &nfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
- file_accessed(file);
+ status = nfs_revalidate_mapping(inode, file->f_mapping);
}
return status;
}
@@ -354,6 +346,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+ /*
+ * Prevent starvation issues if someone is doing a consistency
+ * sync-to-disk
+ */
+ ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ return ret;
+
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return copied;
}
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ * page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page->mapping->host, page);
+
+ nfs_fscache_invalidate_page(page, page->mapping->host);
}
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
static int nfs_release_page(struct page *page, gfp_t gfp)
{
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* If PagePrivate() is set, then the page is not freeable */
- return 0;
+ if (PagePrivate(page))
+ return 0;
+ return nfs_fscache_release_page(page, gfp);
}
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
static int nfs_launder_page(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
inode->i_ino, (long long)page_offset(page));
+ nfs_fscache_wait_on_page_write(nfsi, page);
return nfs_wb_page(inode, page);
}
@@ -451,8 +479,14 @@ const struct address_space_operations nfs_file_aops = {
.launder_page = nfs_launder_page,
};
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct file *filp = vma->vm_file;
struct dentry *dentry = filp->f_path.dentry;
unsigned pagelen;
@@ -464,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
filp->f_mapping->host->i_ino,
(long long)page_offset(page));
+ /* make sure the cache has finished storing the page */
+ nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
lock_page(page);
mapping = page->mapping;
if (mapping != dentry->d_inode->i_mapping)
@@ -483,6 +520,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
ret = pagelen;
out_unlock:
unlock_page(page);
+ if (ret)
+ ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 00000000000..5b1006480bc
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here. The top-level
+ * index can than have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+ .name = "nfs",
+ .version = 0,
+};
+
+/*
+ * Register NFS for caching
+ */
+int nfs_fscache_register(void)
+{
+ return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ */
+void nfs_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+ uint16_t nfsversion; /* NFS protocol version */
+ uint16_t family; /* address family */
+ uint16_t port; /* IP port */
+ union {
+ struct in_addr ipv4_addr; /* IPv4 address */
+ struct in6_addr ipv6_addr; /* IPv6 address */
+ } addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - We return the length of the key, or 0 if we can't generate one
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct nfs_client *clp = cookie_netfs_data;
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+ const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+ struct nfs_server_key *key = buffer;
+ uint16_t len = sizeof(struct nfs_server_key);
+
+ key->nfsversion = clp->rpc_ops->version;
+ key->family = clp->cl_addr.ss_family;
+
+ memset(key, 0, len);
+
+ switch (clp->cl_addr.ss_family) {
+ case AF_INET:
+ key->port = sin->sin_port;
+ key->addr[0].ipv4_addr = sin->sin_addr;
+ len += sizeof(key->addr[0].ipv4_addr);
+ break;
+
+ case AF_INET6:
+ key->port = sin6->sin6_port;
+ key->addr[0].ipv6_addr = sin6->sin6_addr;
+ len += sizeof(key->addr[0].ipv6_addr);
+ break;
+
+ default:
+ printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+ clp->cl_addr.ss_family);
+ len = 0;
+ break;
+ }
+
+ return len;
+}
+
+/*
+ * Define the server object for FS-Cache. This is used to describe a server
+ * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
+ * server address parameters.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+ .name = "NFS.server",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = nfs_server_get_key,
+};
+
+/*
+ * Generate a key to describe a superblock key in the main NFS index
+ */
+static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct nfs_fscache_key *key;
+ const struct nfs_server *nfss = cookie_netfs_data;
+ uint16_t len;
+
+ key = nfss->fscache_key;
+ len = sizeof(key->key) + key->key.uniq_len;
+ if (len > bufmax) {
+ len = 0;
+ } else {
+ memcpy(buffer, &key->key, sizeof(key->key));
+ memcpy(buffer + sizeof(key->key),
+ key->key.uniquifier, key->key.uniq_len);
+ }
+
+ return len;
+}
+
+/*
+ * Define the superblock object for FS-Cache. This is used to describe a
+ * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+ .name = "NFS.super",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = nfs_super_get_key,
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+ struct timespec mtime;
+ struct timespec ctime;
+ loff_t size;
+ u64 change_attr;
+};
+
+/*
+ * Generate a key to describe an NFS inode in an NFS server's index
+ */
+static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ const struct nfs_inode *nfsi = cookie_netfs_data;
+ uint16_t nsize;
+
+ /* use the inode's NFS filehandle as the key */
+ nsize = nfsi->fh.size;
+ memcpy(buffer, nfsi->fh.data, nsize);
+ return nsize;
+}
+
+/*
+ * Get certain file attributes from the netfs data
+ * - This function can be absent for an index
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct nfs_inode *nfsi = cookie_netfs_data;
+
+ *size = nfsi->vfs_inode.i_size;
+}
+
+/*
+ * Get the auxiliary data from netfs data
+ * - This function can be absent if the index carries no state data
+ * - Should store the auxiliary data in the buffer
+ * - Should return the amount of amount stored
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
+ void *buffer, uint16_t bufmax)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ const struct nfs_inode *nfsi = cookie_netfs_data;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.size = nfsi->vfs_inode.i_size;
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata.change_attr = nfsi->change_attr;
+
+ if (bufmax > sizeof(auxdata))
+ bufmax = sizeof(auxdata);
+
+ memcpy(buffer, &auxdata, bufmax);
+ return bufmax;
+}
+
+/*
+ * Consult the netfs about the state of an object
+ * - This function can be absent if the index carries no state data
+ * - The netfs data from the cookie being used as the target is
+ * presented, as is the auxiliary data
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen)
+{
+ struct nfs_fscache_inode_auxdata auxdata;
+ struct nfs_inode *nfsi = cookie_netfs_data;
+
+ if (datalen != sizeof(auxdata))
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ memset(&auxdata, 0, sizeof(auxdata));
+ auxdata.size = nfsi->vfs_inode.i_size;
+ auxdata.mtime = nfsi->vfs_inode.i_mtime;
+ auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata.change_attr = nfsi->change_attr;
+
+ if (memcmp(data, &auxdata, datalen) != 0)
+ return FSCACHE_CHECKAUX_OBSOLETE;
+
+ return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Indication from FS-Cache that the cookie is no longer cached
+ * - This function is called when the backing store currently caching a cookie
+ * is removed
+ * - The netfs should use this to clean up any markers indicating cached pages
+ * - This is mandatory for any object that may have data
+ */
+static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+ struct nfs_inode *nfsi = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
+
+ for (;;) {
+ /* grab a bunch of pages to unmark */
+ nr_pages = pagevec_lookup(&pvec,
+ nfsi->vfs_inode.i_mapping,
+ first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ * context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ * cache fails with EIO - in which case the server must be contacted to
+ * retrieve the data, which requires the read context for security.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+ get_nfs_open_context(context);
+}
+
+/*
+ * Release an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ * context.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+ if (context)
+ put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache. This is used to describe an inode
+ * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
+ * held in the cache auxiliary data for the data storage object with those in
+ * the inode struct in memory.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+ .name = "NFS.fh",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = nfs_fscache_inode_get_key,
+ .get_attr = nfs_fscache_inode_get_attr,
+ .get_aux = nfs_fscache_inode_get_aux,
+ .check_aux = nfs_fscache_inode_check_aux,
+ .now_uncached = nfs_fscache_inode_now_uncached,
+ .get_context = nfs_fh_get_context,
+ .put_context = nfs_fh_put_context,
+};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 00000000000..379be678cb7
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,523 @@
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try and get an index cookie for the client, but get filehandle
+ * cookies on a per-superblock basis, depending on the mount flags
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+ /* create a cache index for looking up filehandles */
+ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+ &nfs_fscache_server_index_def,
+ clp);
+ dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+ clp, clp->fscache);
+}
+
+/*
+ * Dispose of a per-client cookie
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+ dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+ clp, clp->fscache);
+
+ fscache_relinquish_cookie(clp->fscache, 0);
+ clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock. We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb,
+ struct nfs_parsed_mount_data *data)
+{
+ struct nfs_fscache_key *key, *xkey;
+ struct nfs_server *nfss = NFS_SB(sb);
+ struct rb_node **p, *parent;
+ const char *uniq = data->fscache_uniq ?: "";
+ int diff, ulen;
+
+ ulen = strlen(uniq);
+ key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+ if (!key)
+ return;
+
+ key->nfs_client = nfss->nfs_client;
+ key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
+ key->key.nfs_server.flags = nfss->flags;
+ key->key.nfs_server.rsize = nfss->rsize;
+ key->key.nfs_server.wsize = nfss->wsize;
+ key->key.nfs_server.acregmin = nfss->acregmin;
+ key->key.nfs_server.acregmax = nfss->acregmax;
+ key->key.nfs_server.acdirmin = nfss->acdirmin;
+ key->key.nfs_server.acdirmax = nfss->acdirmax;
+ key->key.nfs_server.fsid = nfss->fsid;
+ key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+ key->key.uniq_len = ulen;
+ memcpy(key->key.uniquifier, uniq, ulen);
+
+ spin_lock(&nfs_fscache_keys_lock);
+ p = &nfs_fscache_keys.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+ if (key->nfs_client < xkey->nfs_client)
+ goto go_left;
+ if (key->nfs_client > xkey->nfs_client)
+ goto go_right;
+
+ diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+
+ if (key->key.uniq_len == 0)
+ goto non_unique;
+ diff = memcmp(key->key.uniquifier,
+ xkey->key.uniquifier,
+ key->key.uniq_len);
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+ goto non_unique;
+
+ go_left:
+ p = &(*p)->rb_left;
+ continue;
+ go_right:
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&key->node, parent, p);
+ rb_insert_color(&key->node, &nfs_fscache_keys);
+ spin_unlock(&nfs_fscache_keys_lock);
+ nfss->fscache_key = key;
+
+ /* create a cache index for looking up filehandles */
+ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+ &nfs_fscache_super_index_def,
+ nfss);
+ dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+ nfss, nfss->fscache);
+ return;
+
+non_unique:
+ spin_unlock(&nfs_fscache_keys_lock);
+ kfree(key);
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+ printk(KERN_WARNING "NFS:"
+ " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * release a per-superblock cookie
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+ struct nfs_server *nfss = NFS_SB(sb);
+
+ dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+ nfss, nfss->fscache);
+
+ fscache_relinquish_cookie(nfss->fscache, 0);
+ nfss->fscache = NULL;
+
+ if (nfss->fscache_key) {
+ spin_lock(&nfs_fscache_keys_lock);
+ rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
+ spin_unlock(&nfs_fscache_keys_lock);
+ kfree(nfss->fscache_key);
+ nfss->fscache_key = NULL;
+ }
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ */
+void nfs_fscache_init_inode_cookie(struct inode *inode)
+{
+ NFS_I(inode)->fscache = NULL;
+ if (S_ISREG(inode->i_mode))
+ set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+}
+
+/*
+ * Get the per-inode cache cookie for an NFS inode.
+ */
+static void nfs_fscache_enable_inode_cookie(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (nfsi->fscache || !NFS_FSCACHE(inode))
+ return;
+
+ if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
+ nfsi->fscache = fscache_acquire_cookie(
+ NFS_SB(sb)->fscache,
+ &nfs_fscache_inode_object_def,
+ nfsi);
+
+ dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
+ sb, nfsi, nfsi->fscache);
+ }
+}
+
+/*
+ * Release a per-inode cookie.
+ */
+void nfs_fscache_release_inode_cookie(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
+ nfsi, nfsi->fscache);
+
+ fscache_relinquish_cookie(nfsi->fscache, 0);
+ nfsi->fscache = NULL;
+}
+
+/*
+ * Retire a per-inode cookie, destroying the data attached to it.
+ */
+void nfs_fscache_zap_inode_cookie(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
+ nfsi, nfsi->fscache);
+
+ fscache_relinquish_cookie(nfsi->fscache, 1);
+ nfsi->fscache = NULL;
+}
+
+/*
+ * Turn off the cache with regard to a per-inode cookie if opened for writing,
+ * invalidating all the pages in the page cache relating to the associated
+ * inode to clear the per-page caching.
+ */
+static void nfs_fscache_disable_inode_cookie(struct inode *inode)
+{
+ clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+
+ if (NFS_I(inode)->fscache) {
+ dfprintk(FSCACHE,
+ "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
+
+ /* Need to invalidate any mapped pages that were read in before
+ * turning off the cache.
+ */
+ if (inode->i_mapping && inode->i_mapping->nrpages)
+ invalidate_inode_pages2(inode->i_mapping);
+
+ nfs_fscache_zap_inode_cookie(inode);
+ }
+}
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+static int nfs_fscache_wait_bit(void *flags)
+{
+ schedule();
+ return 0;
+}
+
+/*
+ * Lock against someone else trying to also acquire or relinquish a cookie
+ */
+static inline void nfs_fscache_inode_lock(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
+ wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
+ nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Unlock cookie management lock
+ */
+static inline void nfs_fscache_inode_unlock(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ smp_mb__before_clear_bit();
+ clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
+}
+
+/*
+ * Decide if we should enable or disable local caching for this inode.
+ * - For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ * - May be invoked multiple times in parallel by parallel nfs_open() functions.
+ */
+void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+ if (NFS_FSCACHE(inode)) {
+ nfs_fscache_inode_lock(inode);
+ if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+ nfs_fscache_disable_inode_cookie(inode);
+ else
+ nfs_fscache_enable_inode_cookie(inode);
+ nfs_fscache_inode_unlock(inode);
+ }
+}
+
+/*
+ * Replace a per-inode cookie due to revalidation detecting a file having
+ * changed on the server.
+ */
+void nfs_fscache_reset_inode_cookie(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_server *nfss = NFS_SERVER(inode);
+ struct fscache_cookie *old = nfsi->fscache;
+
+ nfs_fscache_inode_lock(inode);
+ if (nfsi->fscache) {
+ /* retire the current fscache cache and get a new one */
+ fscache_relinquish_cookie(nfsi->fscache, 1);
+
+ nfsi->fscache = fscache_acquire_cookie(
+ nfss->nfs_client->fscache,
+ &nfs_fscache_inode_object_def,
+ nfsi);
+
+ dfprintk(FSCACHE,
+ "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
+ nfss, nfsi, old, nfsi->fscache);
+ }
+ nfs_fscache_inode_unlock(inode);
+}
+
+/*
+ * Release the caching state associated with a page, if the page isn't busy
+ * interacting with the cache.
+ * - Returns true (can release page) or false (page busy).
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+ struct fscache_cookie *cookie = nfsi->fscache;
+
+ BUG_ON(!cookie);
+
+ if (fscache_check_page_write(cookie, page)) {
+ if (!(gfp & __GFP_WAIT))
+ return 0;
+ fscache_wait_on_page_write(cookie, page);
+ }
+
+ if (PageFsCache(page)) {
+ dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+ cookie, page, nfsi);
+
+ fscache_uncache_page(cookie, page);
+ nfs_add_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+ }
+
+ return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct fscache_cookie *cookie = nfsi->fscache;
+
+ BUG_ON(!cookie);
+
+ dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+ cookie, page, nfsi);
+
+ fscache_wait_on_page_write(cookie, page);
+
+ BUG_ON(!PageLocked(page));
+ fscache_uncache_page(cookie, page);
+ nfs_add_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+ void *context,
+ int error)
+{
+ dfprintk(FSCACHE,
+ "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+ page, context, error);
+
+ /* if the read completes with an error, we just unlock the page and let
+ * the VM reissue the readpage */
+ if (!error) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ } else {
+ error = nfs_readpage_async(context, page->mapping->host, page);
+ if (error)
+ unlock_page(page);
+ }
+}
+
+/*
+ * Retrieve a page from fscache
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode, struct page *page)
+{
+ int ret;
+
+ dfprintk(FSCACHE,
+ "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+ NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+
+ ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+ page,
+ nfs_readpage_from_fscache_complete,
+ ctx,
+ GFP_KERNEL);
+
+ switch (ret) {
+ case 0: /* read BIO submitted (page in fscache) */
+ dfprintk(FSCACHE,
+ "NFS: readpage_from_fscache: BIO submitted\n");
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
+ return ret;
+
+ case -ENOBUFS: /* inode not in cache */
+ case -ENODATA: /* page not in cache */
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+ dfprintk(FSCACHE,
+ "NFS: readpage_from_fscache %d\n", ret);
+ return 1;
+
+ default:
+ dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+ }
+ return ret;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ int ret, npages = *nr_pages;
+
+ dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+ NFS_I(inode)->fscache, npages, inode);
+
+ ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+ mapping, pages, nr_pages,
+ nfs_readpage_from_fscache_complete,
+ ctx,
+ mapping_gfp_mask(mapping));
+ if (*nr_pages < npages)
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+ npages);
+ if (*nr_pages > 0)
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+ *nr_pages);
+
+ switch (ret) {
+ case 0: /* read submitted to the cache for all pages */
+ BUG_ON(!list_empty(pages));
+ BUG_ON(*nr_pages != 0);
+ dfprintk(FSCACHE,
+ "NFS: nfs_getpages_from_fscache: submitted\n");
+
+ return ret;
+
+ case -ENOBUFS: /* some pages aren't cached and can't be */
+ case -ENODATA: /* some pages aren't cached */
+ dfprintk(FSCACHE,
+ "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+ return 1;
+
+ default:
+ dfprintk(FSCACHE,
+ "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
+ }
+
+ return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+ int ret;
+
+ dfprintk(FSCACHE,
+ "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+ NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+
+ ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+ dfprintk(FSCACHE,
+ "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+ page, page->index, page->flags, ret);
+
+ if (ret != 0) {
+ fscache_uncache_page(NFS_I(inode)->fscache, page);
+ nfs_add_fscache_stats(inode,
+ NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
+ nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+ } else {
+ nfs_add_fscache_stats(inode,
+ NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
+ }
+}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 00000000000..6e809bb0ff0
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,220 @@
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ */
+struct nfs_fscache_key {
+ struct rb_node node;
+ struct nfs_client *nfs_client; /* the server */
+
+ /* the elements of the unique key - as used by nfs_compare_super() and
+ * nfs_compare_mount_options() to distinguish superblocks */
+ struct {
+ struct {
+ unsigned long s_flags; /* various flags
+ * (& NFS_MS_MASK) */
+ } super;
+
+ struct {
+ struct nfs_fsid fsid;
+ int flags;
+ unsigned int rsize; /* read size */
+ unsigned int wsize; /* write size */
+ unsigned int acregmin; /* attr cache timeouts */
+ unsigned int acregmax;
+ unsigned int acdirmin;
+ unsigned int acdirmax;
+ } nfs_server;
+
+ struct {
+ rpc_authflavor_t au_flavor;
+ } rpc_auth;
+
+ /* uniquifier - can be used if nfs_server.flags includes
+ * NFS_MOUNT_UNSHARED */
+ u8 uniq_len;
+ char uniquifier[0];
+ } key;
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *,
+ struct nfs_parsed_mount_data *);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode_cookie(struct inode *);
+extern void nfs_fscache_release_inode_cookie(struct inode *);
+extern void nfs_fscache_zap_inode_cookie(struct inode *);
+extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void nfs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+ struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+ struct inode *, struct address_space *,
+ struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * release the caching state associated with a page if undergoing complete page
+ * invalidation
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+ struct inode *inode)
+{
+ if (PageFsCache(page))
+ __nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Retrieve a page from an inode data storage object.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page)
+{
+ if (NFS_I(inode)->fscache)
+ return __nfs_readpage_from_fscache(ctx, inode, page);
+ return -ENOBUFS;
+}
+
+/*
+ * Retrieve a set of pages from an inode data storage object.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ if (NFS_I(inode)->fscache)
+ return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+ nr_pages);
+ return -ENOBUFS;
+}
+
+/*
+ * Store a page newly fetched from the server in an inode data storage object
+ * in the cache.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+ struct page *page,
+ int sync)
+{
+ if (PageFsCache(page))
+ __nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+ return "yes";
+ return "no ";
+}
+
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; }
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_get_super_cookie(
+ struct super_block *sb,
+ struct nfs_parsed_mount_data *data)
+{
+}
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
+ struct file *filp) {}
+static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+ struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+ struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+ struct page *page, int sync) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ return "no ";
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f2..46177cb8706 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
return ret;
}
- if (fattr.type != NFDIR) {
+ if (!S_ISDIR(fattr.mode)) {
printk(KERN_ERR "nfs4_get_root:"
" getroot encountered non-directory\n");
return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
return ret;
}
- if (fattr.type != NFDIR) {
+ if (!S_ISDIR(fattr.mode)) {
printk(KERN_ERR "nfs4_get_root:"
" lookupfh encountered non-directory\n");
return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171..64f87194d39 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
#include "delegation.h"
#include "iostat.h"
#include "internal.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -66,6 +67,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
}
/**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ schedule();
+ return 0;
+}
+
+/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
*
@@ -109,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
BUG_ON(!list_empty(&NFS_I(inode)->open_files));
nfs_zap_acl_cache(inode);
nfs_access_zap_cache(inode);
+ nfs_fscache_release_inode_cookie(inode);
}
/**
@@ -249,13 +263,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
struct inode *inode = ERR_PTR(-ENOENT);
unsigned long hash;
- if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
goto out_no_inode;
-
- if (!fattr->nlink) {
- printk("NFS: Buggy server - nlink == 0!\n");
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
goto out_no_inode;
- }
hash = nfs_fattr_to_ino_t(fattr);
@@ -291,7 +302,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
&& fattr->size <= NFS_LIMIT_READDIRPLUS)
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
/* Deal with crossing mountpoints */
- if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+ if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+ && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
inode->i_op = &nfs_referral_inode_operations;
else
@@ -304,30 +316,49 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else
init_special_inode(inode, inode->i_mode, fattr->rdev);
+ memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+ memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ nfsi->change_attr = 0;
+ inode->i_size = 0;
+ inode->i_nlink = 0;
+ inode->i_uid = -2;
+ inode->i_gid = -2;
+ inode->i_blocks = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
- inode->i_atime = fattr->atime;
- inode->i_mtime = fattr->mtime;
- inode->i_ctime = fattr->ctime;
- if (fattr->valid & NFS_ATTR_FATTR_V4)
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
nfsi->change_attr = fattr->change_attr;
- inode->i_size = nfs_size_to_loff_t(fattr->size);
- inode->i_nlink = fattr->nlink;
- inode->i_uid = fattr->uid;
- inode->i_gid = fattr->gid;
- if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ inode->i_size = nfs_size_to_loff_t(fattr->size);
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+ inode->i_nlink = fattr->nlink;
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+ inode->i_uid = fattr->uid;
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+ inode->i_gid = fattr->gid;
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- } else {
- inode->i_blocks = fattr->du.nfs2.blocks;
}
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
- memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
nfsi->access_cache = RB_ROOT;
+ nfs_fscache_init_inode_cookie(inode);
+
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
@@ -514,6 +545,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
return err;
}
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct inode *inode;
+ struct nfs_server *server;
+
+ if (!(ctx->mode & FMODE_WRITE))
+ return;
+ if (!is_sync)
+ return;
+ inode = ctx->path.dentry->d_inode;
+ if (!list_empty(&NFS_I(inode)->open_files))
+ return;
+ server = NFS_SERVER(inode);
+ if (server->flags & NFS_MOUNT_NOCTO)
+ return;
+ nfs_revalidate_inode(server, inode);
+}
+
static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
{
struct nfs_open_context *ctx;
@@ -540,24 +597,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
return ctx;
}
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
{
- struct inode *inode;
-
- if (ctx == NULL)
- return;
+ struct inode *inode = ctx->path.dentry->d_inode;
- inode = ctx->path.dentry->d_inode;
if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
return;
list_del(&ctx->list);
spin_unlock(&inode->i_lock);
- if (ctx->state != NULL) {
- if (wait)
- nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
- else
- nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
- }
+ NFS_PROTO(inode)->close_context(ctx, is_sync);
if (ctx->cred != NULL)
put_rpccred(ctx->cred);
path_put(&ctx->path);
@@ -642,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
ctx->mode = filp->f_mode;
nfs_file_set_open_context(filp, ctx);
put_nfs_open_context(ctx);
+ nfs_fscache_set_inode_cookie(inode, filp);
return 0;
}
@@ -670,9 +719,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
- if (NFS_STALE(inode))
- goto out;
-
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
if (status != 0) {
@@ -745,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
spin_unlock(&inode->i_lock);
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+ nfs_fscache_reset_inode_cookie(inode);
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
return 0;
@@ -815,25 +862,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
struct nfs_inode *nfsi = NFS_I(inode);
- if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
- nfsi->change_attr == fattr->pre_change_attr) {
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+ && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ && nfsi->change_attr == fattr->pre_change_attr) {
nfsi->change_attr = fattr->change_attr;
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
}
/* If we have atomic WCC data, we may update some attributes */
- if ((fattr->valid & NFS_ATTR_WCC) != 0) {
- if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+
+ if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
- }
- if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
- nfsi->npages == 0)
- i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
+ if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+ && (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+ && nfsi->npages == 0)
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
/**
@@ -853,35 +906,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
/* Has the inode gone and changed behind our back? */
- if (nfsi->fileid != fattr->fileid
- || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+ return -EIO;
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO;
- }
- if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
nfsi->change_attr != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
/* Verify a few of the more important attributes */
- if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- cur_size = i_size_read(inode);
- new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize && nfsi->npages == 0)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize && nfsi->npages == 0)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ }
/* Have any file permissions changed? */
- if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
- || inode->i_uid != fattr->uid
- || inode->i_gid != fattr->gid)
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
/* Has the link count changed? */
- if (inode->i_nlink != fattr->nlink)
+ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
invalid |= NFS_INO_INVALID_ATTR;
- if (!timespec_equal(&inode->i_atime, &fattr->atime))
+ if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
if (invalid != 0)
@@ -893,11 +950,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
{
+ if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+ return 0;
return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
}
static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
{
+ if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ return 0;
return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
}
@@ -975,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
spin_lock(&inode->i_lock);
status = nfs_refresh_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
+
return status;
}
@@ -1033,20 +1095,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
/* Don't do a WCC update if these attributes are already stale */
if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
!nfs_inode_attrs_need_update(inode, fattr)) {
- fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+ fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+ | NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME);
goto out_noforce;
}
- if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
- (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
fattr->pre_change_attr = NFS_I(inode)->change_attr;
- fattr->valid |= NFS_ATTR_WCC_V4;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
}
- if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
- (fattr->valid & NFS_ATTR_WCC) == 0) {
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+ fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+ fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
fattr->pre_size = i_size_read(inode);
- fattr->valid |= NFS_ATTR_WCC;
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
}
out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1151,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
__func__, inode->i_sb->s_id, inode->i_ino,
atomic_read(&inode->i_count), fattr->valid);
- if (nfsi->fileid != fattr->fileid)
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
goto out_fileid;
/*
* Make sure the inode's type hasn't changed.
*/
- if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
goto out_changed;
server = NFS_SERVER(inode);
/* Update the fsid? */
- if (S_ISDIR(inode->i_mode) &&
+ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
server->fsid = fattr->fsid;
@@ -1099,14 +1172,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
/* More cache consistency checks */
- if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (nfsi->change_attr != fattr->change_attr) {
+ dprintk("NFS: change_attr change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ nfsi->change_attr = fattr->change_attr;
+ }
+ }
+
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
/* NFSv2/v3: Check if the mtime agrees */
if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1200,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
+ memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
}
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
/* If ctime has changed we should definitely clear access+acl caches */
- if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
- } else if (nfsi->change_attr != fattr->change_attr) {
- dprintk("NFS: change_attr change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
- if (S_ISDIR(inode->i_mode))
- nfs_force_lookup_revalidate(inode);
+ /* and probably clear data for a directory too as utimes can cause
+ * havoc with our cache.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ invalid |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ }
+ memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+ }
}
/* Check if our cached file size is stale */
- new_isize = nfs_size_to_loff_t(fattr->size);
- cur_isize = i_size_read(inode);
- if (new_isize != cur_isize) {
- /* Do we perhaps have any outstanding writes, or has
- * the file grown beyond our last write? */
- if (nfsi->npages == 0 || new_isize > cur_isize) {
- i_size_write(inode, new_isize);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ cur_isize = i_size_read(inode);
+ if (new_isize != cur_isize) {
+ /* Do we perhaps have any outstanding writes, or has
+ * the file grown beyond our last write? */
+ if (nfsi->npages == 0 || new_isize > cur_isize) {
+ i_size_write(inode, new_isize);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ }
+ dprintk("NFS: isize change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
}
- dprintk("NFS: isize change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
}
- memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- nfsi->change_attr = fattr->change_attr;
-
- if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
- inode->i_uid != fattr->uid ||
- inode->i_gid != fattr->gid)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- if (inode->i_nlink != fattr->nlink)
- invalid |= NFS_INO_INVALID_ATTR;
+ if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_mode = fattr->mode;
+ }
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+ if (inode->i_uid != fattr->uid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_uid = fattr->uid;
+ }
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+ if (inode->i_gid != fattr->gid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_gid = fattr->gid;
+ }
+ }
- inode->i_mode = fattr->mode;
- inode->i_nlink = fattr->nlink;
- inode->i_uid = fattr->uid;
- inode->i_gid = fattr->gid;
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+ if (inode->i_nlink != fattr->nlink) {
+ invalid |= NFS_INO_INVALID_ATTR;
+ if (S_ISDIR(inode->i_mode))
+ invalid |= NFS_INO_INVALID_DATA;
+ inode->i_nlink = fattr->nlink;
+ }
+ }
- if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- } else {
- inode->i_blocks = fattr->du.nfs2.blocks;
}
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1381,6 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
- nfsi->ncommit = 0;
nfsi->npages = 0;
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
@@ -1337,6 +1443,10 @@ static int __init init_nfs_fs(void)
{
int err;
+ err = nfs_fscache_register();
+ if (err < 0)
+ goto out7;
+
err = nfsiod_start();
if (err)
goto out6;
@@ -1389,6 +1499,8 @@ out4:
out5:
nfsiod_stop();
out6:
+ nfs_fscache_unregister();
+out7:
return err;
}
@@ -1399,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_readpagecache();
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
+ nfs_fscache_unregister();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister("nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608..e4d6a8348ad 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -5,6 +5,8 @@
#include <linux/mount.h>
#include <linux/security.h>
+#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+
struct nfs_string;
/* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
int acregmin, acregmax,
acdirmin, acdirmax;
int namlen;
+ unsigned int options;
unsigned int bsize;
unsigned int auth_flavor_len;
rpc_authflavor_t auth_flavors[1];
char *client_address;
+ char *fscache_uniq;
struct {
struct sockaddr_storage address;
@@ -152,6 +156,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
extern struct rpc_procinfo nfs4_procedures[];
#endif
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
/* dir.c */
extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
@@ -165,6 +172,7 @@ extern void nfs_clear_inode(struct inode *);
extern void nfs4_clear_inode(struct inode *);
#endif
void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
/* super.c */
void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a3695281003..a2ab2529b5c 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -16,6 +16,9 @@
struct nfs_iostats {
unsigned long long bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+ unsigned long long fscache[__NFSIOS_FSCACHEMAX];
+#endif
unsigned long events[__NFSIOS_COUNTSMAX];
} ____cacheline_aligned;
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+ enum nfs_stat_fscachecounters stat,
+ unsigned long addend)
+{
+ struct nfs_iostats *iostats;
+ int cpu;
+
+ cpu = get_cpu();
+ iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
+ iostats->fscache[stat] += addend;
+ put_cpu_no_resched();
+}
+#endif
+
static inline struct nfs_iostats *nfs_alloc_iostats(void)
{
return alloc_percpu(struct nfs_iostats);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d151..c862c9340f9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
static __be32 *
xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
- u32 rdev;
- fattr->type = (enum nfs_ftype) ntohl(*p++);
+ u32 rdev, type;
+ type = ntohl(*p++);
fattr->mode = ntohl(*p++);
fattr->nlink = ntohl(*p++);
fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_time(p, &fattr->atime);
p = xdr_decode_time(p, &fattr->mtime);
p = xdr_decode_time(p, &fattr->ctime);
- fattr->valid |= NFS_ATTR_FATTR;
+ fattr->valid |= NFS_ATTR_FATTR_V2;
fattr->rdev = new_decode_dev(rdev);
- if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
- fattr->type = NFFIFO;
+ if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
fattr->rdev = 0;
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679..d0cc5ce0edf 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
data->arg.create.verifier[1] = current->pid;
}
- sattr->ia_mode &= ~current->fs->umask;
+ sattr->ia_mode &= ~current_umask();
for (;;) {
status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
- sattr->ia_mode &= ~current->fs->umask;
+ sattr->ia_mode &= ~current_umask();
data = nfs3_alloc_createdata();
if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
MAJOR(rdev), MINOR(rdev));
- sattr->ia_mode &= ~current->fs->umask;
+ sattr->ia_mode &= ~current_umask();
data = nfs3_alloc_createdata();
if (data == NULL)
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.commit_done = nfs3_commit_done,
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
+ .close_context = nfs_close_context,
};
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde4..e6a1932c711 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
/*
* Map file type to S_IFMT bits
*/
-static struct {
- unsigned int mode;
- unsigned int nfs2type;
-} nfs_type2fmt[] = {
- { 0, NFNON },
- { S_IFREG, NFREG },
- { S_IFDIR, NFDIR },
- { S_IFBLK, NFBLK },
- { S_IFCHR, NFCHR },
- { S_IFLNK, NFLNK },
- { S_IFSOCK, NFSOCK },
- { S_IFIFO, NFFIFO },
- { 0, NFBAD }
+static const umode_t nfs_type2fmt[] = {
+ [NF3BAD] = 0,
+ [NF3REG] = S_IFREG,
+ [NF3DIR] = S_IFDIR,
+ [NF3BLK] = S_IFBLK,
+ [NF3CHR] = S_IFCHR,
+ [NF3LNK] = S_IFLNK,
+ [NF3SOCK] = S_IFSOCK,
+ [NF3FIFO] = S_IFIFO,
};
/*
@@ -148,13 +144,12 @@ static __be32 *
xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
unsigned int type, major, minor;
- int fmode;
+ umode_t fmode;
type = ntohl(*p++);
- if (type >= NF3BAD)
- type = NF3BAD;
- fmode = nfs_type2fmt[type].mode;
- fattr->type = nfs_type2fmt[type].nfs2type;
+ if (type > NF3FIFO)
+ type = NF3NON;
+ fmode = nfs_type2fmt[type];
fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
fattr->nlink = ntohl(*p++);
fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_time3(p, &fattr->ctime);
/* Update the mode bits */
- fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+ fattr->valid |= NFS_ATTR_FATTR_V3;
return p;
}
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_hyper(p, &fattr->pre_size);
p = xdr_decode_time3(p, &fattr->pre_mtime);
p = xdr_decode_time3(p, &fattr->pre_ctime);
- fattr->valid |= NFS_ATTR_WCC;
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME;
return p;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d..a4d24268029 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start, KM_USER0);
}
-static int nfs4_wait_bit_killable(void *word)
-{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
- schedule();
- return 0;
-}
-
static int nfs4_wait_clnt_recover(struct nfs_client *clp)
{
int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
might_sleep();
res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
- nfs4_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable, TASK_KILLABLE);
return res;
}
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
if (calldata->arg.seqid == NULL)
goto out_free_calldata;
calldata->arg.fmode = 0;
- calldata->arg.bitmask = server->attr_bitmask;
+ calldata->arg.bitmask = server->cache_consistency_bitmask;
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
@@ -1509,7 +1501,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
attr.ia_mode = nd->intent.open.create_mode;
attr.ia_valid = ATTR_MODE;
if (!IS_POSIXACL(dir))
- attr.ia_mode &= ~current->fs->umask;
+ attr.ia_mode &= ~current_umask();
} else {
attr.ia_valid = 0;
BUG_ON(nd->intent.open.flags & O_CREAT);
@@ -1580,6 +1572,15 @@ out_drop:
return 0;
}
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ if (ctx->state == NULL)
+ return;
+ if (is_sync)
+ nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+ else
+ nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+ memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+ server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+ server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->acl_bitmask = res.acl_bitmask;
}
return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
struct nfs_removeargs *args = msg->rpc_argp;
struct nfs_removeres *res = msg->rpc_resp;
- args->bitmask = server->attr_bitmask;
+ args->bitmask = server->cache_consistency_bitmask;
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.pages = &page,
.pgbase = 0,
.count = count,
- .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+ .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
};
struct nfs4_readdir_res res;
struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
{
struct nfs_server *server = NFS_SERVER(data->inode);
- data->args.bitmask = server->attr_bitmask;
+ data->args.bitmask = server->cache_consistency_bitmask;
data->res.server = server;
data->timestamp = jiffies;
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
{
struct nfs_server *server = NFS_SERVER(data->inode);
- data->args.bitmask = server->attr_bitmask;
+ data->args.bitmask = server->cache_consistency_bitmask;
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
}
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
return len;
}
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+ if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+ (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+ (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+ return;
+
+ fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+ NFS_ATTR_FATTR_NLINK;
+ fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ fattr->nlink = 2;
+}
+
int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page)
{
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
fs_locations->server = server;
fs_locations->nlocations = 0;
status = rpc_call_sync(server->client, &msg, 0);
+ nfs_fixup_referral_attributes(&fs_locations->fattr);
dprintk("%s: returned status = %d\n", __func__, status);
return status;
}
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.commit_done = nfs4_commit_done,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
+ .close_context = nfs4_close_context,
};
/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966..0298e909559 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
{
- int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
- nfs_callback_tcpport, cred);
+ unsigned short port;
+ int status;
+
+ port = nfs_callback_tcpport;
+ if (clp->cl_addr.ss_family == AF_INET6)
+ port = nfs_callback_tcpport6;
+
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
if (status == 0)
status = nfs4_proc_setclientid_confirm(clp, cred);
if (status == 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a..1690f0e44b9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
decode_lookup_maxsz + \
decode_fs_locations_maxsz)
-static struct {
- unsigned int mode;
- unsigned int nfs2type;
-} nfs_type2fmt[] = {
- { 0, NFNON },
- { S_IFREG, NFREG },
- { S_IFDIR, NFDIR },
- { S_IFBLK, NFBLK },
- { S_IFCHR, NFCHR },
- { S_IFLNK, NFLNK },
- { S_IFSOCK, NFSOCK },
- { S_IFIFO, NFFIFO },
- { 0, NFNON },
- { 0, NFNON },
+static const umode_t nfs_type2fmt[] = {
+ [NF4BAD] = 0,
+ [NF4REG] = S_IFREG,
+ [NF4DIR] = S_IFDIR,
+ [NF4BLK] = S_IFBLK,
+ [NF4CHR] = S_IFCHR,
+ [NF4LNK] = S_IFLNK,
+ [NF4SOCK] = S_IFSOCK,
+ [NF4FIFO] = S_IFIFO,
+ [NF4ATTRDIR] = 0,
+ [NF4NAMEDATTR] = 0,
};
struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
{
__be32 *p;
+ int ret = 0;
*type = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
return -EIO;
}
bitmap[0] &= ~FATTR4_WORD0_TYPE;
+ ret = NFS_ATTR_FATTR_TYPE;
}
- dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
- return 0;
+ dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+ return ret;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
{
__be32 *p;
+ int ret = 0;
*change = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ_BUF(8);
READ64(*change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+ ret = NFS_ATTR_FATTR_CHANGE;
}
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
- return 0;
+ return ret;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
{
__be32 *p;
+ int ret = 0;
*size = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
READ_BUF(8);
READ64(*size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
+ ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
- return 0;
+ return ret;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
{
__be32 *p;
+ int ret = 0;
fsid->major = 0;
fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
READ64(fsid->major);
READ64(fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
+ ret = NFS_ATTR_FATTR_FSID;
}
dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
- return 0;
+ return ret;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
__be32 *p;
+ int ret = 0;
*fileid = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ_BUF(8);
READ64(*fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
+ ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
- return 0;
+ return ret;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
__be32 *p;
+ int ret = 0;
*fileid = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
READ_BUF(8);
READ64(*fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+ ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
- return 0;
+ return ret;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
res->nlocations++;
}
+ if (res->nlocations != 0)
+ status = NFS_ATTR_FATTR_V4_REFERRAL;
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
return status;
}
-static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
{
+ uint32_t tmp;
__be32 *p;
+ int ret = 0;
*mode = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
READ_BUF(4);
- READ32(*mode);
- *mode &= ~S_IFMT;
+ READ32(tmp);
+ *mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
+ ret = NFS_ATTR_FATTR_MODE;
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
- return 0;
+ return ret;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
{
__be32 *p;
+ int ret = 0;
*nlink = 1;
if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
READ_BUF(4);
READ32(*nlink);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+ ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
- return 0;
+ return ret;
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
{
uint32_t len;
__be32 *p;
+ int ret = 0;
*uid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
READ32(len);
READ_BUF(len);
if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
+ if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+ ret = NFS_ATTR_FATTR_OWNER;
+ else
dprintk("%s: nfs_map_name_to_uid failed!\n",
__func__);
} else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
bitmap[1] &= ~FATTR4_WORD1_OWNER;
}
dprintk("%s: uid=%d\n", __func__, (int)*uid);
- return 0;
+ return ret;
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
{
uint32_t len;
__be32 *p;
+ int ret = 0;
*gid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
READ32(len);
READ_BUF(len);
if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
+ if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+ ret = NFS_ATTR_FATTR_GROUP;
+ else
dprintk("%s: nfs_map_group_to_gid failed!\n",
__func__);
} else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
}
dprintk("%s: gid=%d\n", __func__, (int)*gid);
- return 0;
+ return ret;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
{
uint32_t major = 0, minor = 0;
__be32 *p;
+ int ret = 0;
*rdev = MKDEV(0,0);
if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
if (MAJOR(tmp) == major && MINOR(tmp) == minor)
*rdev = tmp;
bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+ ret = NFS_ATTR_FATTR_RDEV;
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
- return 0;
+ return ret;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
{
__be32 *p;
+ int ret = 0;
*used = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ_BUF(8);
READ64(*used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+ ret = NFS_ATTR_FATTR_SPACE_USED;
}
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
- return 0;
+ return ret;
}
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_ATIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
}
dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_CTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
}
dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_MTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
}
dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
uint32_t attrlen,
bitmap[2] = {0},
type;
- int status, fmode = 0;
+ int status;
+ umode_t fmode = 0;
uint64_t fileid;
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
- goto xdr_error;
- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ status = decode_op_hdr(xdr, OP_GETATTR);
+ if (status < 0)
goto xdr_error;
- fattr->bitmap[0] = bitmap[0];
- fattr->bitmap[1] = bitmap[1];
+ status = decode_attr_bitmap(xdr, bitmap);
+ if (status < 0)
+ goto xdr_error;
- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ status = decode_attr_length(xdr, &attrlen, &savep);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
+ status = decode_attr_type(xdr, bitmap, &type);
+ if (status < 0)
goto xdr_error;
- fattr->type = nfs_type2fmt[type].nfs2type;
- fmode = nfs_type2fmt[type].mode;
+ fattr->mode = 0;
+ if (status != 0) {
+ fattr->mode |= nfs_type2fmt[type];
+ fattr->valid |= status;
+ }
- if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
+ status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_size(xdr, bitmap, &fattr->size);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+ fattr->valid |= status;
+
+ status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
struct nfs4_fs_locations,
- fattr))) != 0)
+ fattr));
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_mode(xdr, bitmap, &fmode);
+ if (status < 0)
goto xdr_error;
- fattr->mode |= fmode;
- if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
+ if (status != 0) {
+ fattr->mode |= fmode;
+ fattr->valid |= status;
+ }
+
+ status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+ if (status < 0)
goto xdr_error;
- if (fattr->fileid == 0 && fileid != 0)
+ if (status != 0 && !(fattr->valid & status)) {
fattr->fileid = fileid;
- if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
- fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
+ fattr->valid |= status;
+ }
+
+ status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
status = decode_setattr(&xdr, res);
if (status)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
- if (status == NFS4ERR_DELAY)
- status = 0;
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70..e2975939126 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
kref_put(&req->wb_kref, nfs_free_request);
}
-static int nfs_wait_bit_killable(void *word)
-{
- int ret = 0;
-
- if (fatal_signal_pending(current))
- ret = -ERESTARTSYS;
- else
- schedule();
- return ret;
-}
-
/**
* nfs_wait_on_request - Wait for a request to complete.
* @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7..7be72d90d49 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.commit_setup = nfs_proc_commit_setup,
.lock = nfs_proc_lock,
.lock_check_bounds = nfs_lock_check_bounds,
+ .close_context = nfs_close_context,
};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f856004bb7f..4ace3c50a8e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,6 +24,7 @@
#include "internal.h"
#include "iostat.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
}
}
-static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page)
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+ struct page *page)
{
LIST_HEAD(one_request);
struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
static void nfs_readpage_release(struct nfs_page *req)
{
+ struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
unlock_page(req->wb_page);
dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
} else
ctx = get_nfs_open_context(nfs_file_open_context(file));
+ if (!IS_SYNC(inode)) {
+ error = nfs_readpage_from_fscache(ctx, inode, page);
+ if (error == 0)
+ goto out;
+ }
+
error = nfs_readpage_async(ctx, inode, page);
+out:
put_nfs_open_context(ctx);
return error;
out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
return -EBADF;
} else
desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+ /* attempt to read as many of the pages as possible from the cache
+ * - this returns -ENOBUFS immediately if the cookie is negative
+ */
+ ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+ pages, &nr_pages);
+ if (ret == 0)
+ goto read_complete; /* all pages were read */
+
if (rsize < PAGE_CACHE_SIZE)
nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
nfs_pageio_complete(&pgio);
npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
put_nfs_open_context(desc.ctx);
out:
return ret;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786d..82eaadbff40 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,6 +60,7 @@
#include "delegation.h"
#include "iostat.h"
#include "internal.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -76,6 +77,7 @@ enum {
Opt_rdirplus, Opt_nordirplus,
Opt_sharecache, Opt_nosharecache,
Opt_resvport, Opt_noresvport,
+ Opt_fscache, Opt_nofscache,
/* Mount options that take integer arguments */
Opt_port,
@@ -93,6 +95,7 @@ enum {
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
Opt_lookupcache,
+ Opt_fscache_uniq,
/* Special mount options */
Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_nosharecache, "nosharecache" },
{ Opt_resvport, "resvport" },
{ Opt_noresvport, "noresvport" },
+ { Opt_fscache, "fsc" },
+ { Opt_fscache_uniq, "fsc=%s" },
+ { Opt_nofscache, "nofsc" },
{ Opt_port, "port=%u" },
{ Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (clp->rpc_ops->version == 4)
seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
#endif
+ if (nfss->options & NFS_OPTION_FSCACHE)
+ seq_printf(m, ",fsc");
}
/*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
totals.events[i] += stats->events[i];
for (i = 0; i < __NFSIOS_BYTESMAX; i++)
totals.bytes[i] += stats->bytes[i];
+#ifdef CONFIG_NFS_FSCACHE
+ for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+ totals.fscache[i] += stats->fscache[i];
+#endif
preempt_enable();
}
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "\n\tbytes:\t");
for (i = 0; i < __NFSIOS_BYTESMAX; i++)
seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+ if (nfss->options & NFS_OPTION_FSCACHE) {
+ seq_printf(m, "\n\tfsc:\t");
+ for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+ seq_printf(m, "%Lu ", totals.bytes[i]);
+ }
+#endif
seq_printf(m, "\n");
rpc_print_iostats(m, nfss->client);
@@ -1018,6 +1037,7 @@ static int nfs_parse_mount_options(char *raw,
case Opt_rdma:
mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(p);
break;
case Opt_acl:
mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1043,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
case Opt_noresvport:
mnt->flags |= NFS_MOUNT_NORESVPORT;
break;
+ case Opt_fscache:
+ mnt->options |= NFS_OPTION_FSCACHE;
+ kfree(mnt->fscache_uniq);
+ mnt->fscache_uniq = NULL;
+ break;
+ case Opt_nofscache:
+ mnt->options &= ~NFS_OPTION_FSCACHE;
+ kfree(mnt->fscache_uniq);
+ mnt->fscache_uniq = NULL;
+ break;
+ case Opt_fscache_uniq:
+ string = match_strdup(args);
+ if (!string)
+ goto out_nomem;
+ kfree(mnt->fscache_uniq);
+ mnt->fscache_uniq = string;
+ mnt->options |= NFS_OPTION_FSCACHE;
+ break;
/*
* options that take numeric values
@@ -1205,12 +1243,14 @@ static int nfs_parse_mount_options(char *raw,
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(string);
break;
default:
errors++;
dfprintk(MOUNT, "NFS: unrecognized "
"transport protocol\n");
}
+ kfree(string);
break;
case Opt_mountproto:
string = match_strdup(args);
@@ -1218,7 +1258,6 @@ static int nfs_parse_mount_options(char *raw,
goto out_nomem;
token = match_token(string,
nfs_xprt_protocol_tokens, args);
- kfree(string);
switch (token) {
case Opt_xprt_udp:
@@ -1868,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
nfs_initialise_sb(sb);
}
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
-
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
{
const struct nfs_server *a = s->s_fs_info;
@@ -2034,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs_fill_super(s, data);
+ nfs_fscache_get_super_cookie(s, data);
}
mntroot = nfs_get_root(s, mntfh);
@@ -2054,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
out:
kfree(data->nfs_server.hostname);
kfree(data->mount_server.hostname);
+ kfree(data->fscache_uniq);
security_free_mnt_opts(&data->lsm_opts);
out_free_fh:
kfree(mntfh);
@@ -2081,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
bdi_unregister(&server->backing_dev_info);
kill_anon_super(s);
+ nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
}
@@ -2388,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
nfs4_fill_super(s);
+ nfs_fscache_get_super_cookie(s, data);
}
mntroot = nfs4_get_root(s, mntfh);
@@ -2409,6 +2450,7 @@ out:
kfree(data->client_address);
kfree(data->nfs_server.export_path);
kfree(data->nfs_server.hostname);
+ kfree(data->fscache_uniq);
security_free_mnt_opts(&data->lsm_opts);
out_free_fh:
kfree(mntfh);
@@ -2435,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
kill_anon_super(sb);
nfs4_renewd_prepare_shutdown(server);
+ nfs_fscache_release_super_cookie(sb);
nfs_free_server(server);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc..e560a78995a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio;
int err;
+ /* Stop dirtying of new pages while we sync */
+ err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (err)
+ goto out_err;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
+
+ clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_FLUSHING);
+
if (err < 0)
- return err;
- if (pgio.pg_error < 0)
- return pgio.pg_error;
+ goto out_err;
+ err = pgio.pg_error;
+ if (err < 0)
+ goto out_err;
return 0;
+out_err:
+ return err;
}
/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- nfsi->ncommit++;
set_bit(PG_CLEAN, &(req)->wb_flags);
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int
+nfs_need_commit(struct nfs_inode *nfsi)
+{
+ return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+}
+
/*
* nfs_scan_commit - Scan an inode for commit requests
* @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
{
struct nfs_inode *nfsi = NFS_I(inode);
- int res = 0;
- if (nfsi->ncommit != 0) {
- res = nfs_scan_list(nfsi, dst, idx_start, npages,
- NFS_PAGE_TAG_COMMIT);
- nfsi->ncommit -= res;
- }
- return res;
+ if (!nfs_need_commit(nfsi))
+ return 0;
+
+ return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
}
#else
+static inline int nfs_need_commit(struct nfs_inode *nfsi)
+{
+ return 0;
+}
+
static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
{
return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->args.stable = NFS_UNSTABLE;
if (how & FLUSH_STABLE) {
data->args.stable = NFS_DATA_SYNC;
- if (!NFS_I(inode)->ncommit)
+ if (!nfs_need_commit(NFS_I(inode)))
data->args.stable = NFS_FILE_SYNC;
}
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
{
struct writeback_control wbc = {
.bdi = mapping->backing_dev_info,
- .sync_mode = WB_SYNC_NONE,
+ .sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.for_writepages = 1,
};
- int ret;
- ret = __nfs_write_mapping(mapping, &wbc, how);
- if (ret < 0)
- return ret;
- wbc.sync_mode = WB_SYNC_ALL;
return __nfs_write_mapping(mapping, &wbc, how);
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce..a4ed8644d69 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
char transport[16];
int port;
if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+ if (port < 1 || port > 65535)
+ return -EINVAL;
err = nfsd_create_serv();
if (!err) {
err = svc_create_xprt(nfsd_serv,
- transport, port,
+ transport, PF_INET, port,
SVC_SOCK_ANONYMOUS);
if (err == -ENOENT)
/* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
char transport[16];
int port;
if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
- if (port == 0)
+ if (port < 1 || port > 65535)
return -EINVAL;
if (nfsd_serv) {
xprt = svc_find_xprt(nfsd_serv, transport,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa..7c09852be71 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- AF_INET,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nfsd_serv, "udp", port,
+ error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
if (error < 0)
return error;
- error = svc_create_xprt(nfsd_serv, "tcp", port,
+ error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
@@ -404,7 +403,6 @@ static int
nfsd(void *vrqstp)
{
struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
- struct fs_struct *fsp;
int err, preverr = 0;
/* Lock module and set up kernel thread */
@@ -413,13 +411,11 @@ nfsd(void *vrqstp)
/* At this point, the thread shares current->fs
* with the init process. We need to create files with a
* umask of 0 instead of init's umask. */
- fsp = copy_fs_struct(current->fs);
- if (!fsp) {
+ if (unshare_fs_struct() < 0) {
printk("Unable to start nfsd thread: out of memory\n");
goto out;
}
- exit_fs(current);
- current->fs = fsp;
+
current->fs->umask = 0;
/*
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd..5a9e34475e3 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
/**
* The little endian Unicode string $I30 as a global constant.
*/
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
- const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+ cpu_to_le16('3'), cpu_to_le16('0'), 0 };
/**
* ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0..82c5085559c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
goto em_put_err_out;
next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
le16_to_cpu(al_entry->length));
- if (le32_to_cpu(al_entry->type) >
- const_le32_to_cpu(AT_DATA))
+ if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
goto em_put_err_out;
if (AT_DATA != al_entry->type)
continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328ece..50931b1ce4b 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
#include "types.h"
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x) __constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x) __constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x) __constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x) __constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x) __constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x) __constant_cpu_to_le64(x)
-
/* The NTFS oem_id "NTFS " */
-#define magicNTFS const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS cpu_to_le64(0x202020205346544eULL)
/*
* Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
*/
enum {
/* Found in $MFT/$DATA. */
- magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
- magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
- magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+ magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+ magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+ magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
/* Found in $LogFile/$DATA. */
- magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
- magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+ magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+ magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
/* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
- magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+ magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
/* Found in all ntfs record containing records. */
- magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+ magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
transfer was detected. */
/*
* Found in $LogFile/$DATA when a page is full of 0xff bytes and is
* thus not initialized. Page must be initialized before using it.
*/
- magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+ magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
};
typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
* information about the mft record in which they are present.
*/
enum {
- MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001),
- MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+ MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
+ MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
} __attribute__ ((__packed__));
typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
* Note: The _LE versions will return a CPU endian formatted value!
*/
#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
typedef u64 MFT_REF;
typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
* a revealing choice of symbol I do not know what is... (-;
*/
enum {
- AT_UNUSED = const_cpu_to_le32( 0),
- AT_STANDARD_INFORMATION = const_cpu_to_le32( 0x10),
- AT_ATTRIBUTE_LIST = const_cpu_to_le32( 0x20),
- AT_FILE_NAME = const_cpu_to_le32( 0x30),
- AT_OBJECT_ID = const_cpu_to_le32( 0x40),
- AT_SECURITY_DESCRIPTOR = const_cpu_to_le32( 0x50),
- AT_VOLUME_NAME = const_cpu_to_le32( 0x60),
- AT_VOLUME_INFORMATION = const_cpu_to_le32( 0x70),
- AT_DATA = const_cpu_to_le32( 0x80),
- AT_INDEX_ROOT = const_cpu_to_le32( 0x90),
- AT_INDEX_ALLOCATION = const_cpu_to_le32( 0xa0),
- AT_BITMAP = const_cpu_to_le32( 0xb0),
- AT_REPARSE_POINT = const_cpu_to_le32( 0xc0),
- AT_EA_INFORMATION = const_cpu_to_le32( 0xd0),
- AT_EA = const_cpu_to_le32( 0xe0),
- AT_PROPERTY_SET = const_cpu_to_le32( 0xf0),
- AT_LOGGED_UTILITY_STREAM = const_cpu_to_le32( 0x100),
- AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32( 0x1000),
- AT_END = const_cpu_to_le32(0xffffffff)
+ AT_UNUSED = cpu_to_le32( 0),
+ AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
+ AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
+ AT_FILE_NAME = cpu_to_le32( 0x30),
+ AT_OBJECT_ID = cpu_to_le32( 0x40),
+ AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
+ AT_VOLUME_NAME = cpu_to_le32( 0x60),
+ AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
+ AT_DATA = cpu_to_le32( 0x80),
+ AT_INDEX_ROOT = cpu_to_le32( 0x90),
+ AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
+ AT_BITMAP = cpu_to_le32( 0xb0),
+ AT_REPARSE_POINT = cpu_to_le32( 0xc0),
+ AT_EA_INFORMATION = cpu_to_le32( 0xd0),
+ AT_EA = cpu_to_le32( 0xe0),
+ AT_PROPERTY_SET = cpu_to_le32( 0xf0),
+ AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
+ AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
+ AT_END = cpu_to_le32(0xffffffff)
};
typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
* equal then the second le32 values would be compared, etc.
*/
enum {
- COLLATION_BINARY = const_cpu_to_le32(0x00),
- COLLATION_FILE_NAME = const_cpu_to_le32(0x01),
- COLLATION_UNICODE_STRING = const_cpu_to_le32(0x02),
- COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10),
- COLLATION_NTOFS_SID = const_cpu_to_le32(0x11),
- COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12),
- COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13),
+ COLLATION_BINARY = cpu_to_le32(0x00),
+ COLLATION_FILE_NAME = cpu_to_le32(0x01),
+ COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
+ COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
+ COLLATION_NTOFS_SID = cpu_to_le32(0x11),
+ COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
+ COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
};
typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
* NT4.
*/
enum {
- ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be
+ ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
indexed. */
- ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type
+ ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
can be present multiple times in the
mft records of an inode. */
- ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value
+ ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
must contain at least one non-zero
byte. */
- ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be
+ ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
indexed and the attribute value must be
unique for the attribute type in all of
the mft records of an inode. */
- ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be
+ ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
named and the name must be unique for
the attribute type in all of the mft
records of an inode. */
- ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be
+ ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
resident. */
- ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log
+ ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
modifications to this attribute,
regardless of whether it is resident or
non-resident. Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
* Attribute flags (16-bit).
*/
enum {
- ATTR_IS_COMPRESSED = const_cpu_to_le16(0x0001),
- ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+ ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
+ ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
mask. Also, first
illegal value. */
- ATTR_IS_ENCRYPTED = const_cpu_to_le16(0x4000),
- ATTR_IS_SPARSE = const_cpu_to_le16(0x8000),
+ ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
+ ATTR_IS_SPARSE = cpu_to_le16(0x8000),
} __attribute__ ((__packed__));
typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
* flags appear in all of the above.
*/
enum {
- FILE_ATTR_READONLY = const_cpu_to_le32(0x00000001),
- FILE_ATTR_HIDDEN = const_cpu_to_le32(0x00000002),
- FILE_ATTR_SYSTEM = const_cpu_to_le32(0x00000004),
- /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
+ FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
+ FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
+ FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
+ /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
- FILE_ATTR_DIRECTORY = const_cpu_to_le32(0x00000010),
+ FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
reserved for the DOS SUBDIRECTORY flag. */
- FILE_ATTR_ARCHIVE = const_cpu_to_le32(0x00000020),
- FILE_ATTR_DEVICE = const_cpu_to_le32(0x00000040),
- FILE_ATTR_NORMAL = const_cpu_to_le32(0x00000080),
+ FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
+ FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
+ FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
- FILE_ATTR_TEMPORARY = const_cpu_to_le32(0x00000100),
- FILE_ATTR_SPARSE_FILE = const_cpu_to_le32(0x00000200),
- FILE_ATTR_REPARSE_POINT = const_cpu_to_le32(0x00000400),
- FILE_ATTR_COMPRESSED = const_cpu_to_le32(0x00000800),
+ FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
+ FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
+ FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
+ FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
- FILE_ATTR_OFFLINE = const_cpu_to_le32(0x00001000),
- FILE_ATTR_NOT_CONTENT_INDEXED = const_cpu_to_le32(0x00002000),
- FILE_ATTR_ENCRYPTED = const_cpu_to_le32(0x00004000),
+ FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
+ FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
+ FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
- FILE_ATTR_VALID_FLAGS = const_cpu_to_le32(0x00007fb7),
+ FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
FILE_ATTR_DEVICE and preserves everything else. This mask is used
to obtain all flags that are valid for reading. */
- FILE_ATTR_VALID_SET_FLAGS = const_cpu_to_le32(0x000031a7),
+ FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
@@ -846,11 +835,11 @@ enum {
* FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
* attribute of an mft record.
*/
- FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000),
+ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
/* Note, this is a copy of the corresponding bit from the mft record,
telling us whether this is a directory or not, i.e. whether it has
an index root attribute or not. */
- FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000),
+ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
/* Note, this is a copy of the corresponding bit from the mft record,
telling us whether this file has a view index present (eg. object id
index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
/* Specific rights for files and directories are as follows: */
/* Right to read data from the file. (FILE) */
- FILE_READ_DATA = const_cpu_to_le32(0x00000001),
+ FILE_READ_DATA = cpu_to_le32(0x00000001),
/* Right to list contents of a directory. (DIRECTORY) */
- FILE_LIST_DIRECTORY = const_cpu_to_le32(0x00000001),
+ FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
/* Right to write data to the file. (FILE) */
- FILE_WRITE_DATA = const_cpu_to_le32(0x00000002),
+ FILE_WRITE_DATA = cpu_to_le32(0x00000002),
/* Right to create a file in the directory. (DIRECTORY) */
- FILE_ADD_FILE = const_cpu_to_le32(0x00000002),
+ FILE_ADD_FILE = cpu_to_le32(0x00000002),
/* Right to append data to the file. (FILE) */
- FILE_APPEND_DATA = const_cpu_to_le32(0x00000004),
+ FILE_APPEND_DATA = cpu_to_le32(0x00000004),
/* Right to create a subdirectory. (DIRECTORY) */
- FILE_ADD_SUBDIRECTORY = const_cpu_to_le32(0x00000004),
+ FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
/* Right to read extended attributes. (FILE/DIRECTORY) */
- FILE_READ_EA = const_cpu_to_le32(0x00000008),
+ FILE_READ_EA = cpu_to_le32(0x00000008),
/* Right to write extended attributes. (FILE/DIRECTORY) */
- FILE_WRITE_EA = const_cpu_to_le32(0x00000010),
+ FILE_WRITE_EA = cpu_to_le32(0x00000010),
/* Right to execute a file. (FILE) */
- FILE_EXECUTE = const_cpu_to_le32(0x00000020),
+ FILE_EXECUTE = cpu_to_le32(0x00000020),
/* Right to traverse the directory. (DIRECTORY) */
- FILE_TRAVERSE = const_cpu_to_le32(0x00000020),
+ FILE_TRAVERSE = cpu_to_le32(0x00000020),
/*
* Right to delete a directory and all the files it contains (its
* children), even if the files are read-only. (DIRECTORY)
*/
- FILE_DELETE_CHILD = const_cpu_to_le32(0x00000040),
+ FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
/* Right to read file attributes. (FILE/DIRECTORY) */
- FILE_READ_ATTRIBUTES = const_cpu_to_le32(0x00000080),
+ FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
/* Right to change file attributes. (FILE/DIRECTORY) */
- FILE_WRITE_ATTRIBUTES = const_cpu_to_le32(0x00000100),
+ FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
/*
* The standard rights (bits 16 to 23). These are independent of the
@@ -1489,27 +1478,27 @@ enum {
*/
/* Right to delete the object. */
- DELETE = const_cpu_to_le32(0x00010000),
+ DELETE = cpu_to_le32(0x00010000),
/*
* Right to read the information in the object's security descriptor,
* not including the information in the SACL, i.e. right to read the
* security descriptor and owner.
*/
- READ_CONTROL = const_cpu_to_le32(0x00020000),
+ READ_CONTROL = cpu_to_le32(0x00020000),
/* Right to modify the DACL in the object's security descriptor. */
- WRITE_DAC = const_cpu_to_le32(0x00040000),
+ WRITE_DAC = cpu_to_le32(0x00040000),
/* Right to change the owner in the object's security descriptor. */
- WRITE_OWNER = const_cpu_to_le32(0x00080000),
+ WRITE_OWNER = cpu_to_le32(0x00080000),
/*
* Right to use the object for synchronization. Enables a process to
* wait until the object is in the signalled state. Some object types
* do not support this access right.
*/
- SYNCHRONIZE = const_cpu_to_le32(0x00100000),
+ SYNCHRONIZE = cpu_to_le32(0x00100000),
/*
* The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
*/
/* These are currently defined to READ_CONTROL. */
- STANDARD_RIGHTS_READ = const_cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_WRITE = const_cpu_to_le32(0x00020000),
- STANDARD_RIGHTS_EXECUTE = const_cpu_to_le32(0x00020000),
+ STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
+ STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
+ STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
- STANDARD_RIGHTS_REQUIRED = const_cpu_to_le32(0x000f0000),
+ STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
/*
* Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
* SYNCHRONIZE access.
*/
- STANDARD_RIGHTS_ALL = const_cpu_to_le32(0x001f0000),
+ STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
/*
* The access system ACL and maximum allowed access types (bits 24 to
* 25, bits 26 to 27 are reserved).
*/
- ACCESS_SYSTEM_SECURITY = const_cpu_to_le32(0x01000000),
- MAXIMUM_ALLOWED = const_cpu_to_le32(0x02000000),
+ ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
+ MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
/*
* The generic rights (bits 28 to 31). These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
*/
/* Read, write, and execute access. */
- GENERIC_ALL = const_cpu_to_le32(0x10000000),
+ GENERIC_ALL = cpu_to_le32(0x10000000),
/* Execute access. */
- GENERIC_EXECUTE = const_cpu_to_le32(0x20000000),
+ GENERIC_EXECUTE = cpu_to_le32(0x20000000),
/*
* Write access. For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
* For directories, the mapping has the same numerical value. See
* above for the descriptions of the rights granted.
*/
- GENERIC_WRITE = const_cpu_to_le32(0x40000000),
+ GENERIC_WRITE = cpu_to_le32(0x40000000),
/*
* Read access. For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
* For directories, the mapping has the same numberical value. See
* above for the descriptions of the rights granted.
*/
- GENERIC_READ = const_cpu_to_le32(0x80000000),
+ GENERIC_READ = cpu_to_le32(0x80000000),
};
typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
* The object ACE flags (32-bit).
*/
enum {
- ACE_OBJECT_TYPE_PRESENT = const_cpu_to_le32(1),
- ACE_INHERITED_OBJECT_TYPE_PRESENT = const_cpu_to_le32(2),
+ ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
+ ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
};
typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
* expressed as offsets from the beginning of the security descriptor.
*/
enum {
- SE_OWNER_DEFAULTED = const_cpu_to_le16(0x0001),
- SE_GROUP_DEFAULTED = const_cpu_to_le16(0x0002),
- SE_DACL_PRESENT = const_cpu_to_le16(0x0004),
- SE_DACL_DEFAULTED = const_cpu_to_le16(0x0008),
-
- SE_SACL_PRESENT = const_cpu_to_le16(0x0010),
- SE_SACL_DEFAULTED = const_cpu_to_le16(0x0020),
-
- SE_DACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0100),
- SE_SACL_AUTO_INHERIT_REQ = const_cpu_to_le16(0x0200),
- SE_DACL_AUTO_INHERITED = const_cpu_to_le16(0x0400),
- SE_SACL_AUTO_INHERITED = const_cpu_to_le16(0x0800),
-
- SE_DACL_PROTECTED = const_cpu_to_le16(0x1000),
- SE_SACL_PROTECTED = const_cpu_to_le16(0x2000),
- SE_RM_CONTROL_VALID = const_cpu_to_le16(0x4000),
- SE_SELF_RELATIVE = const_cpu_to_le16(0x8000)
+ SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
+ SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
+ SE_DACL_PRESENT = cpu_to_le16(0x0004),
+ SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
+
+ SE_SACL_PRESENT = cpu_to_le16(0x0010),
+ SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
+
+ SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
+ SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
+ SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
+ SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
+
+ SE_DACL_PROTECTED = cpu_to_le16(0x1000),
+ SE_SACL_PROTECTED = cpu_to_le16(0x2000),
+ SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
+ SE_SELF_RELATIVE = cpu_to_le16(0x8000)
} __attribute__ ((__packed__));
typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
* Possible flags for the volume (16-bit).
*/
enum {
- VOLUME_IS_DIRTY = const_cpu_to_le16(0x0001),
- VOLUME_RESIZE_LOG_FILE = const_cpu_to_le16(0x0002),
- VOLUME_UPGRADE_ON_MOUNT = const_cpu_to_le16(0x0004),
- VOLUME_MOUNTED_ON_NT4 = const_cpu_to_le16(0x0008),
+ VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
+ VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
+ VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
+ VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
- VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010),
- VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020),
+ VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
+ VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
- VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000),
- VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000),
+ VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
+ VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
- VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f),
+ VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
/* To make our life easier when checking if we must mount read-only. */
- VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027),
+ VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
} __attribute__ ((__packed__));
typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
* The user quota flags. Names explain meaning.
*/
enum {
- QUOTA_FLAG_DEFAULT_LIMITS = const_cpu_to_le32(0x00000001),
- QUOTA_FLAG_LIMIT_REACHED = const_cpu_to_le32(0x00000002),
- QUOTA_FLAG_ID_DELETED = const_cpu_to_le32(0x00000004),
+ QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
+ QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
+ QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
- QUOTA_FLAG_USER_MASK = const_cpu_to_le32(0x00000007),
+ QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
/* This is a bit mask for the user quota flags. */
/*
* These flags are only present in the quota defaults index entry, i.e.
* in the entry where owner_id = QUOTA_DEFAULTS_ID.
*/
- QUOTA_FLAG_TRACKING_ENABLED = const_cpu_to_le32(0x00000010),
- QUOTA_FLAG_ENFORCEMENT_ENABLED = const_cpu_to_le32(0x00000020),
- QUOTA_FLAG_TRACKING_REQUESTED = const_cpu_to_le32(0x00000040),
- QUOTA_FLAG_LOG_THRESHOLD = const_cpu_to_le32(0x00000080),
-
- QUOTA_FLAG_LOG_LIMIT = const_cpu_to_le32(0x00000100),
- QUOTA_FLAG_OUT_OF_DATE = const_cpu_to_le32(0x00000200),
- QUOTA_FLAG_CORRUPT = const_cpu_to_le32(0x00000400),
- QUOTA_FLAG_PENDING_DELETES = const_cpu_to_le32(0x00000800),
+ QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
+ QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
+ QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
+ QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
+
+ QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
+ QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
+ QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
+ QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
};
typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
* Predefined owner_id values (32-bit).
*/
enum {
- QUOTA_INVALID_ID = const_cpu_to_le32(0x00000000),
- QUOTA_DEFAULTS_ID = const_cpu_to_le32(0x00000001),
- QUOTA_FIRST_USER_ID = const_cpu_to_le32(0x00000100),
+ QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
+ QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
+ QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
};
/*
@@ -2189,14 +2178,14 @@ typedef enum {
* Index entry flags (16-bit).
*/
enum {
- INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+ INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
sub-node, i.e. a reference to an index block in form of
a virtual cluster number (see below). */
- INDEX_ENTRY_END = const_cpu_to_le16(2), /* This signifies the last
+ INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
entry in an index block. The index entry does not
represent a file but it can point to a sub-node. */
- INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+ INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
enum bit width to 16-bit. */
} __attribute__ ((__packed__));
@@ -2334,26 +2323,26 @@ typedef struct {
* These are the predefined reparse point tags:
*/
enum {
- IO_REPARSE_TAG_IS_ALIAS = const_cpu_to_le32(0x20000000),
- IO_REPARSE_TAG_IS_HIGH_LATENCY = const_cpu_to_le32(0x40000000),
- IO_REPARSE_TAG_IS_MICROSOFT = const_cpu_to_le32(0x80000000),
+ IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
+ IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
+ IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
- IO_REPARSE_TAG_RESERVED_ZERO = const_cpu_to_le32(0x00000000),
- IO_REPARSE_TAG_RESERVED_ONE = const_cpu_to_le32(0x00000001),
- IO_REPARSE_TAG_RESERVED_RANGE = const_cpu_to_le32(0x00000001),
+ IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
+ IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
+ IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
- IO_REPARSE_TAG_NSS = const_cpu_to_le32(0x68000005),
- IO_REPARSE_TAG_NSS_RECOVER = const_cpu_to_le32(0x68000006),
- IO_REPARSE_TAG_SIS = const_cpu_to_le32(0x68000007),
- IO_REPARSE_TAG_DFS = const_cpu_to_le32(0x68000008),
+ IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
+ IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
+ IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
+ IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
- IO_REPARSE_TAG_MOUNT_POINT = const_cpu_to_le32(0x88000003),
+ IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
- IO_REPARSE_TAG_HSM = const_cpu_to_le32(0xa8000004),
+ IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
- IO_REPARSE_TAG_SYMBOLIC_LINK = const_cpu_to_le32(0xe8000000),
+ IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
- IO_REPARSE_TAG_VALID_VALUES = const_cpu_to_le32(0xe000ffff),
+ IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
};
/*
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae..b5a6f08bd35 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
* in this particular client array. Also inside the client records themselves,
* this means that there are no client records preceding or following this one.
*/
-#define LOGFILE_NO_CLIENT const_cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
#define LOGFILE_NO_CLIENT_CPU 0xffff
/*
@@ -112,8 +112,8 @@ typedef struct {
* information about the log file in which they are present.
*/
enum {
- RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002),
- RESTART_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+ RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
+ RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
} __attribute__ ((__packed__));
typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc3..23bf68453d7 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
*/
/* Mark the mft record as not in use. */
- m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+ m->flags &= ~MFT_RECORD_IN_USE;
/* Increment the sequence number, skipping zero, if it is not zero. */
old_seq_no = m->sequence_number;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b507..f76951dcd4a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
* many BIOSes will refuse to boot from a bootsector if the magic is
* incorrect, so we emit a warning.
*/
- if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55))
+ if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
ntfs_warning(sb, "Invalid end of sector marker.");
return true;
not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
u32 *kaddr, *kend;
ntfs_name *name = NULL;
int ret = 1;
- static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'),
- const_cpu_to_le16('i'), const_cpu_to_le16('b'),
- const_cpu_to_le16('e'), const_cpu_to_le16('r'),
- const_cpu_to_le16('f'), const_cpu_to_le16('i'),
- const_cpu_to_le16('l'), const_cpu_to_le16('.'),
- const_cpu_to_le16('s'), const_cpu_to_le16('y'),
- const_cpu_to_le16('s'), 0 };
+ static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+ cpu_to_le16('i'), cpu_to_le16('b'),
+ cpu_to_le16('e'), cpu_to_le16('r'),
+ cpu_to_le16('f'), cpu_to_le16('i'),
+ cpu_to_le16('l'), cpu_to_le16('.'),
+ cpu_to_le16('s'), cpu_to_le16('y'),
+ cpu_to_le16('s'), 0 };
ntfs_debug("Entering.");
/*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
goto iput_out;
}
kaddr = (u32*)page_address(page);
- if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) {
+ if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
"hibernated on the volume. This is the "
"system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
MFT_REF mref;
struct inode *tmp_ino;
ntfs_name *name = NULL;
- static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
- const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
- const_cpu_to_le16('o'), const_cpu_to_le16('t'),
- const_cpu_to_le16('a'), 0 };
- static ntfschar Q[3] = { const_cpu_to_le16('$'),
- const_cpu_to_le16('Q'), 0 };
+ static const ntfschar Quota[7] = { cpu_to_le16('$'),
+ cpu_to_le16('Q'), cpu_to_le16('u'),
+ cpu_to_le16('o'), cpu_to_le16('t'),
+ cpu_to_le16('a'), 0 };
+ static ntfschar Q[3] = { cpu_to_le16('$'),
+ cpu_to_le16('Q'), 0 };
ntfs_debug("Entering.");
/*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
struct page *page;
ntfs_name *name = NULL;
USN_HEADER *uh;
- static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'),
- const_cpu_to_le16('U'), const_cpu_to_le16('s'),
- const_cpu_to_le16('n'), const_cpu_to_le16('J'),
- const_cpu_to_le16('r'), const_cpu_to_le16('n'),
- const_cpu_to_le16('l'), 0 };
- static ntfschar Max[5] = { const_cpu_to_le16('$'),
- const_cpu_to_le16('M'), const_cpu_to_le16('a'),
- const_cpu_to_le16('x'), 0 };
- static ntfschar J[3] = { const_cpu_to_le16('$'),
- const_cpu_to_le16('J'), 0 };
+ static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+ cpu_to_le16('U'), cpu_to_le16('s'),
+ cpu_to_le16('n'), cpu_to_le16('J'),
+ cpu_to_le16('r'), cpu_to_le16('n'),
+ cpu_to_le16('l'), 0 };
+ static ntfschar Max[5] = { cpu_to_le16('$'),
+ cpu_to_le16('M'), cpu_to_le16('a'),
+ cpu_to_le16('x'), 0 };
+ static ntfschar J[3] = { cpu_to_le16('$'),
+ cpu_to_le16('J'), 0 };
ntfs_debug("Entering.");
/*
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac32..00d8e6bd7c3 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
* documentation: http://www.linux-ntfs.org/
*/
enum {
- USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
- USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002),
- USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004),
- USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010),
- USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020),
- USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040),
- USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100),
- USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200),
- USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400),
- USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800),
- USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000),
- USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000),
- USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000),
- USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000),
- USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000),
- USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000),
- USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000),
- USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000),
- USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000),
- USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000),
- USN_REASON_CLOSE = const_cpu_to_le32(0x80000000),
+ USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
+ USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
+ USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
+ USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
+ USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
+ USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+ USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
+ USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
+ USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
+ USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
+ USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
+ USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
+ USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
+ USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
+ USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
+ USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
+ USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
+ USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
+ USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
+ USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
+ USN_REASON_CLOSE = cpu_to_le32(0x80000000),
};
typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
* http://www.linux-ntfs.org/
*/
enum {
- USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
- USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002),
- USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004),
+ USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
+ USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
+ USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
};
typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e..fbeaec76210 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
return PTR_ERR(acl);
}
if (!acl)
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
struct posix_acl *clone;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713e..b606496b72e 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
return ret;
}
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct buffer_head *di_bh = NULL;
sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
if (ret2 < 0)
mlog_errno(ret2);
-
+ if (ret)
+ ret = VM_FAULT_SIGBUS;
return ret;
}
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972b..379ae5fb441 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -262,14 +262,19 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *s = dentry->d_sb;
struct omfs_sb_info *sbi = OMFS_SB(s);
+ u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
buf->f_type = OMFS_MAGIC;
buf->f_bsize = sbi->s_blocksize;
buf->f_blocks = sbi->s_num_blocks;
buf->f_files = sbi->s_num_blocks;
buf->f_namelen = OMFS_NAMELEN;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_bfree = buf->f_bavail = buf->f_ffree =
omfs_count_free(s);
+
return 0;
}
@@ -421,7 +426,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_uid = current_uid();
sbi->s_gid = current_gid();
- sbi->s_dmask = sbi->s_fmask = current->fs->umask;
+ sbi->s_dmask = sbi->s_fmask = current_umask();
if (!parse_options((char *) data, sbi))
goto end;
diff --git a/fs/open.c b/fs/open.c
index 75b61677daa..377eb25b6ab 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
+#include <linux/fs_struct.h>
int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b68..f71559784bf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
+#include <linux/fs_struct.h>
#include "internal.h"
/* NOTE:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384..74ea974f5ca 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeram-i.freehigh),
#endif
#ifndef CONFIG_MMU
- K((unsigned long) atomic_read(&mmap_pages_allocated)),
+ K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
#endif
K(i.totalswap),
K(i.freeswap),
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae6..83adcc86943 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
{
struct proc_dir_entry *ent;
- if (!driver->ops->read_proc || !driver->driver_name ||
- driver->proc_entry)
+ if (!driver->driver_name || driver->proc_entry ||
+ !driver->ops->proc_fops)
return;
- ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
- if (!ent)
- return;
- ent->read_proc = driver->ops->read_proc;
- ent->data = driver;
-
+ ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
+ driver->ops->proc_fops, driver);
driver->proc_entry = ent;
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc..863464d5519 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
#include <linux/mount.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
@@ -49,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(mm);
- if (current->fs && atomic_read(&current->fs->count) > 1)
+ if (current->fs && current->fs->users > 1)
sbytes += kobjsize(current->fs);
else
bytes += kobjsize(current->fs);
@@ -136,14 +137,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
}
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
vma->vm_start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
- vma->vm_pgoff << PAGE_SHIFT,
+ (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
MAJOR(dev), MINOR(dev), ino, &len);
if (file) {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2aad1044b84..fe1f0f31d11 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -282,6 +282,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
lock_kernel();
@@ -291,6 +292,8 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree = qnx4_count_free_blocks(sb);
buf->f_bavail = buf->f_bfree;
buf->f_namelen = QNX4_NAME_MAX;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
unlock_kernel();
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 2ca967a5ef7..607c579e5ec 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -823,7 +823,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
spin_lock(&inode_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
continue;
if (!atomic_read(&inode->i_writecount))
continue;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686..ebb2c417912 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
*/
int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
{
- struct pagevec lru_pvec;
unsigned long npages, xpages, loop, limit;
struct page *pages;
unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
memset(data, 0, newsize);
/* attach all the pages to the inode's address space */
- pagevec_init(&lru_pvec, 0);
for (loop = 0; loop < npages; loop++) {
struct page *page = pages + loop;
- ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
+ GFP_KERNEL);
if (ret < 0)
goto add_error;
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add_file(&lru_pvec);
-
/* prevent the page from being discarded on memory pressure */
SetPageDirty(page);
unlock_page(page);
}
- pagevec_lru_add_file(&lru_pvec);
return 0;
fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
return -EFBIG;
add_error:
- pagevec_lru_add_file(&lru_pvec);
- page_cache_release(pages + loop);
- for (loop++; loop < npages; loop++)
- __free_page(pages + loop);
+ while (loop < npages)
+ __free_page(pages + loop++);
return ret;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b8..a404fb88e45 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
#include <linux/backing-dev.h>
#include <linux/ramfs.h>
#include <linux/sched.h>
+#include <linux/parser.h>
#include <asm/uaccess.h>
#include "internal.h"
/* some random number */
#define RAMFS_MAGIC 0x858458f6
+#define RAMFS_DEFAULT_MODE 0755
+
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
@@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = {
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
+ .show_options = generic_show_options,
+};
+
+struct ramfs_mount_opts {
+ umode_t mode;
+};
+
+enum {
+ Opt_mode,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+struct ramfs_fs_info {
+ struct ramfs_mount_opts mount_opts;
};
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int token;
+ char *p;
+
+ opts->mode = RAMFS_DEFAULT_MODE;
+
+ while ((p = strsep(&data, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ default:
+ printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
{
- struct inode * inode;
- struct dentry * root;
+ struct ramfs_fs_info *fsi;
+ struct inode *inode = NULL;
+ struct dentry *root;
+ int err;
+
+ save_mount_options(sb, data);
+
+ fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+ if (!fsi) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ sb->s_fs_info = fsi;
+
+ err = ramfs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
- inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
- if (!inode)
- return -ENOMEM;
+ inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+ if (!inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
root = d_alloc_root(inode);
if (!root) {
- iput(inode);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto fail;
}
sb->s_root = root;
return 0;
+fail:
+ kfree(fsi);
+ iput(inode);
+ return err;
}
int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
mnt);
}
+static void ramfs_kill_sb(struct super_block *sb)
+{
+ kfree(sb->s_fs_info);
+ kill_litter_super(sb);
+}
+
static struct file_system_type ramfs_fs_type = {
.name = "ramfs",
.get_sb = ramfs_get_sb,
- .kill_sb = kill_litter_super,
+ .kill_sb = ramfs_kill_sb,
};
static struct file_system_type rootfs_fs_type = {
.name = "rootfs",
diff --git a/fs/read_write.c b/fs/read_write.c
index 400fe81c973..6d5d8ff238a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -731,6 +731,56 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, u32, pos_high, u32, pos_low)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ struct file *file;
+ ssize_t ret = -EBADF;
+ int fput_needed;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget_light(fd, &fput_needed);
+ if (file) {
+ ret = -ESPIPE;
+ if (file->f_mode & FMODE_PREAD)
+ ret = vfs_readv(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
+ }
+
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, u32, pos_high, u32, pos_low)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+ struct file *file;
+ ssize_t ret = -EBADF;
+ int fput_needed;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ file = fget_light(fd, &fput_needed);
+ if (file) {
+ ret = -ESPIPE;
+ if (file->f_mode & FMODE_PWRITE)
+ ret = vfs_writev(file, vec, vlen, &pos);
+ fput_light(file, fput_needed);
+ }
+
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 949b8c6addc..513f431038f 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,5 +1,6 @@
config REISERFS_FS
tristate "Reiserfs support"
+ select CRC32
help
Stores not just filenames but the files themselves in a balanced
tree. Uses journalling.
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 972250c6289..0ae6486d904 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,6 +27,7 @@
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/crc32.h>
struct file_system_type reiserfs_fs_type;
@@ -1904,6 +1905,10 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = dentry->d_sb->s_blocksize;
/* changed to accommodate gcc folks. */
buf->f_type = REISERFS_SUPER_MAGIC;
+ buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
+ buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
+ sizeof(rs->s_uuid)/2);
+
return 0;
}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d..c303c426fe2 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
} else {
apply_umask:
/* no ACL, apply umask */
- inode->i_mode &= ~current->fs->umask;
+ inode->i_mode &= ~current_umask();
}
return err;
diff --git a/fs/splice.c b/fs/splice.c
index 4ed0ba44a96..dd727d43e5b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
*/
wait_on_page_writeback(page);
- if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
goto out_unlock;
/*
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 681ec0d8379..ffa6edcd2d0 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -301,6 +301,7 @@ failure:
static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
+ u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
TRACE("Entered squashfs_statfs\n");
@@ -311,6 +312,8 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = msblk->inodes;
buf->f_ffree = 0;
buf->f_namelen = SQUASHFS_NAME_LEN;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
return 0;
}
diff --git a/fs/super.c b/fs/super.c
index 2ba481518ba..77cb4ec919b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
__fsync_super(sb);
return sync_blockdev(sb->s_bdev);
}
+EXPORT_SYMBOL_GPL(fsync_super);
/**
* generic_shutdown_super - common helper for ->kill_sb()
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a..93e0c0281d4 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return ret;
}
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
struct bin_buffer *bb = file->private_data;
@@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
int ret;
if (!bb->vm_ops)
- return -EINVAL;
+ return VM_FAULT_SIGBUS;
if (!bb->vm_ops->page_mkwrite)
return 0;
if (!sysfs_get_active_two(attr_sd))
- return -EINVAL;
+ return VM_FAULT_SIGBUS;
- ret = bb->vm_ops->page_mkwrite(vma, page);
+ ret = bb->vm_ops->page_mkwrite(vma, vmf);
sysfs_put_active_two(attr_sd);
return ret;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3d81bf58dae..da20b48d350 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -90,6 +90,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct sysv_sb_info *sbi = SYSV_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
@@ -98,6 +99,8 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sbi->s_ninodes;
buf->f_ffree = sysv_count_free_inodes(sb);
buf->f_namelen = SYSV_NAMELEN;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
return 0;
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f26..0ff89fe71e5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
* mmap()d file has taken write protection fault and is being made
* writable. UBIFS must ensure page is budgeted for.
*/
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
if (unlikely(c->ro_media))
- return -EROFS;
+ return VM_FAULT_SIGBUS; /* -EROFS */
/*
* We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
if (err == -ENOSPC)
ubifs_warn("out of space for mmapped file "
"(inode number %lu)", inode->i_ino);
- return err;
+ return VM_FAULT_SIGBUS;
}
lock_page(page);
@@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
out_unlock:
unlock_page(page);
ubifs_release_budget(c, &req);
+ if (err)
+ err = VM_FAULT_SIGBUS;
return err;
}
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 2bb788a2acb..e48e9a3af76 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
{
struct buffer_head *bh = NULL;
int retval = 0;
- kernel_lb_addr loc;
+ struct kernel_lb_addr loc;
loc.logicalBlockNum = bitmap->s_extPosition;
loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
- bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block));
+ bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
if (!bh)
retval = -EIO;
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
return slot;
}
-static bool udf_add_free_space(struct udf_sb_info *sbi,
- u16 partition, u32 cnt)
+static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
{
+ struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDesc *lvid;
- if (sbi->s_lvid_bh == NULL)
- return false;
+ if (!sbi->s_lvid_bh)
+ return;
lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
- return true;
+ udf_updated_lvid(sb);
}
static void udf_bitmap_free_blocks(struct super_block *sb,
struct inode *inode,
struct udf_bitmap *bitmap,
- kernel_lb_addr bloc, uint32_t offset,
+ struct kernel_lb_addr *bloc,
+ uint32_t offset,
uint32_t count)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = NULL;
+ struct udf_part_map *partmap;
unsigned long block;
unsigned long block_group;
unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
unsigned long overflow;
mutex_lock(&sbi->s_alloc_mutex);
- if (bloc.logicalBlockNum < 0 ||
- (bloc.logicalBlockNum + count) >
- sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+ partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+ if (bloc->logicalBlockNum < 0 ||
+ (bloc->logicalBlockNum + count) >
+ partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
- bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
- sbi->s_partmaps[bloc.partitionReferenceNum].
- s_partition_len);
+ bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
+ count, partmap->s_partition_len);
goto error_return;
}
- block = bloc.logicalBlockNum + offset +
+ block = bloc->logicalBlockNum + offset +
(sizeof(struct spaceBitmapDesc) << 3);
do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
} else {
if (inode)
vfs_dq_free_block(inode, 1);
- udf_add_free_space(sbi, sbi->s_partition, 1);
+ udf_add_free_space(sb, sbi->s_partition, 1);
}
}
mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
} while (overflow);
error_return:
- sb->s_dirt = 1;
- if (sbi->s_lvid_bh)
- mark_buffer_dirty(sbi->s_lvid_bh);
mutex_unlock(&sbi->s_alloc_mutex);
}
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
} while (block_count > 0);
out:
- if (udf_add_free_space(sbi, partition, -alloc_count))
- mark_buffer_dirty(sbi->s_lvid_bh);
- sb->s_dirt = 1;
+ udf_add_free_space(sb, partition, -alloc_count);
mutex_unlock(&sbi->s_alloc_mutex);
return alloc_count;
}
@@ -409,9 +406,7 @@ got_block:
mark_buffer_dirty(bh);
- if (udf_add_free_space(sbi, partition, -1))
- mark_buffer_dirty(sbi->s_lvid_bh);
- sb->s_dirt = 1;
+ udf_add_free_space(sb, partition, -1);
mutex_unlock(&sbi->s_alloc_mutex);
*err = 0;
return newblock;
@@ -425,26 +420,28 @@ error_return:
static void udf_table_free_blocks(struct super_block *sb,
struct inode *inode,
struct inode *table,
- kernel_lb_addr bloc, uint32_t offset,
+ struct kernel_lb_addr *bloc,
+ uint32_t offset,
uint32_t count)
{
struct udf_sb_info *sbi = UDF_SB(sb);
+ struct udf_part_map *partmap;
uint32_t start, end;
uint32_t elen;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
struct extent_position oepos, epos;
int8_t etype;
int i;
struct udf_inode_info *iinfo;
mutex_lock(&sbi->s_alloc_mutex);
- if (bloc.logicalBlockNum < 0 ||
- (bloc.logicalBlockNum + count) >
- sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+ partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+ if (bloc->logicalBlockNum < 0 ||
+ (bloc->logicalBlockNum + count) >
+ partmap->s_partition_len) {
udf_debug("%d < %d || %d + %d > %d\n",
bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
- sbi->s_partmaps[bloc.partitionReferenceNum].
- s_partition_len);
+ partmap->s_partition_len);
goto error_return;
}
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
could occure, but.. oh well */
if (inode)
vfs_dq_free_block(inode, count);
- if (udf_add_free_space(sbi, sbi->s_partition, count))
- mark_buffer_dirty(sbi->s_lvid_bh);
+ udf_add_free_space(sb, sbi->s_partition, count);
- start = bloc.logicalBlockNum + offset;
- end = bloc.logicalBlockNum + offset + count - 1;
+ start = bloc->logicalBlockNum + offset;
+ end = bloc->logicalBlockNum + offset + count - 1;
epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
start += count;
count = 0;
}
- udf_write_aext(table, &oepos, eloc, elen, 1);
+ udf_write_aext(table, &oepos, &eloc, elen, 1);
} else if (eloc.logicalBlockNum == (end + 1)) {
if ((0x3FFFFFFF - elen) <
(count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
end -= count;
count = 0;
}
- udf_write_aext(table, &oepos, eloc, elen, 1);
+ udf_write_aext(table, &oepos, &eloc, elen, 1);
}
if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
*/
int adsize;
- short_ad *sad = NULL;
- long_ad *lad = NULL;
+ struct short_ad *sad = NULL;
+ struct long_ad *lad = NULL;
struct allocExtDesc *aed;
eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
(count << sb->s_blocksize_bits);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else {
brelse(oepos.bh);
brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
elen -= sb->s_blocksize;
epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, epos.block, 0));
+ udf_get_lb_pblock(sb, &epos.block, 0));
if (!epos.bh) {
brelse(oepos.bh);
goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
if (sbi->s_udfrev >= 0x0200)
udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
3, 1, epos.block.logicalBlockNum,
- sizeof(tag));
+ sizeof(struct tag));
else
udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
2, 1, epos.block.logicalBlockNum,
- sizeof(tag));
+ sizeof(struct tag));
switch (iinfo->i_alloc_type) {
case ICBTAG_FLAG_AD_SHORT:
- sad = (short_ad *)sptr;
+ sad = (struct short_ad *)sptr;
sad->extLength = cpu_to_le32(
EXT_NEXT_EXTENT_ALLOCDECS |
sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
cpu_to_le32(epos.block.logicalBlockNum);
break;
case ICBTAG_FLAG_AD_LONG:
- lad = (long_ad *)sptr;
+ lad = (struct long_ad *)sptr;
lad->extLength = cpu_to_le32(
EXT_NEXT_EXTENT_ALLOCDECS |
sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
/* It's possible that stealing the block emptied the extent */
if (elen) {
- udf_write_aext(table, &epos, eloc, elen, 1);
+ udf_write_aext(table, &epos, &eloc, elen, 1);
if (!epos.bh) {
iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
brelse(oepos.bh);
error_return:
- sb->s_dirt = 1;
mutex_unlock(&sbi->s_alloc_mutex);
return;
}
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
struct udf_sb_info *sbi = UDF_SB(sb);
int alloc_count = 0;
uint32_t elen, adsize;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
struct extent_position epos;
int8_t etype = -1;
struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
iinfo = UDF_I(table);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
return 0;
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
alloc_count = block_count;
eloc.logicalBlockNum += alloc_count;
elen -= (alloc_count << sb->s_blocksize_bits);
- udf_write_aext(table, &epos, eloc,
+ udf_write_aext(table, &epos, &eloc,
(etype << 30) | elen, 1);
} else
udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
brelse(epos.bh);
- if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) {
- mark_buffer_dirty(sbi->s_lvid_bh);
- sb->s_dirt = 1;
- }
+ if (alloc_count)
+ udf_add_free_space(sb, partition, -alloc_count);
mutex_unlock(&sbi->s_alloc_mutex);
return alloc_count;
}
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
uint32_t newblock = 0, adsize;
uint32_t elen, goal_elen = 0;
- kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+ struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
struct extent_position epos, goal_epos;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
*err = -ENOSPC;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
return newblock;
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
}
if (goal_elen)
- udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1);
+ udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
else
udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
brelse(goal_epos.bh);
- if (udf_add_free_space(sbi, partition, -1))
- mark_buffer_dirty(sbi->s_lvid_bh);
+ udf_add_free_space(sb, partition, -1);
- sb->s_dirt = 1;
mutex_unlock(&sbi->s_alloc_mutex);
*err = 0;
return newblock;
}
-inline void udf_free_blocks(struct super_block *sb,
- struct inode *inode,
- kernel_lb_addr bloc, uint32_t offset,
- uint32_t count)
+void udf_free_blocks(struct super_block *sb, struct inode *inode,
+ struct kernel_lb_addr *bloc, uint32_t offset,
+ uint32_t count)
{
- uint16_t partition = bloc.partitionReferenceNum;
+ uint16_t partition = bloc->partitionReferenceNum;
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
- return udf_bitmap_free_blocks(sb, inode,
- map->s_uspace.s_bitmap,
- bloc, offset, count);
+ udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
+ bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
- return udf_table_free_blocks(sb, inode,
- map->s_uspace.s_table,
- bloc, offset, count);
+ udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
+ bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- return udf_bitmap_free_blocks(sb, inode,
- map->s_fspace.s_bitmap,
- bloc, offset, count);
+ udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
+ bloc, offset, count);
} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- return udf_table_free_blocks(sb, inode,
- map->s_fspace.s_table,
- bloc, offset, count);
- } else {
- return;
+ udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
+ bloc, offset, count);
}
}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 62dc270c69d..2efd4d5291b 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
uint8_t lfi;
loff_t size = udf_ext0_offset(dir) + dir->i_size;
struct buffer_head *tmp, *bha[16];
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
sector_t offset;
int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
ret = -ENOENT;
goto out;
}
- block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+ block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- epos.offset -= sizeof(short_ad);
+ epos.offset -= sizeof(struct short_ad);
else if (iinfo->i_alloc_type ==
ICBTAG_FLAG_AD_LONG)
- epos.offset -= sizeof(long_ad);
+ epos.offset -= sizeof(struct long_ad);
} else {
offset = 0;
}
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
for (num = 0; i > 0; i--) {
- block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i);
+ block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
tmp = udf_tgetblk(dir->i_sb, block);
if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
memcpy(fname, "..", flen);
dt_type = DT_DIR;
} else {
- kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
+ struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
- iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0);
+ iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
dt_type = DT_UNKNOWN;
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 2820f8fcf4c..1d2c570704c 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -20,7 +20,7 @@
#if 0
static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
- uint8_t ad_size, kernel_lb_addr fe_loc,
+ uint8_t ad_size, struct kernel_lb_addr fe_loc,
int *pos, int *offset, struct buffer_head **bh,
int *error)
{
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
struct fileIdentDesc *cfi,
struct extent_position *epos,
- kernel_lb_addr *eloc, uint32_t *elen,
+ struct kernel_lb_addr *eloc, uint32_t *elen,
sector_t *offset)
{
struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
(EXT_RECORDED_ALLOCATED >> 30))
return NULL;
- block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+ block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
(*offset)++;
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
if (i + *offset > (*elen >> blocksize_bits))
i = (*elen >> blocksize_bits)-*offset;
for (num = 0; i > 0; i--) {
- block = udf_get_lb_pblock(dir->i_sb, *eloc,
+ block = udf_get_lb_pblock(dir->i_sb, eloc,
*offset + i);
tmp = udf_tgetblk(dir->i_sb, block);
if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
(EXT_RECORDED_ALLOCATED >> 30))
return NULL;
- block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+ block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
(*offset)++;
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
}
#if 0
-static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
+static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
{
- extent_ad *ext;
+ struct extent_ad *ext;
struct fileEntry *fe;
uint8_t *ptr;
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
ptr += *offset;
- ext = (extent_ad *)ptr;
+ ext = (struct extent_ad *)ptr;
- *offset = *offset + sizeof(extent_ad);
+ *offset = *offset + sizeof(struct extent_ad);
return ext;
}
#endif
-short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
+struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
int inc)
{
- short_ad *sa;
+ struct short_ad *sa;
if ((!ptr) || (!offset)) {
printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
return NULL;
}
- if ((*offset + sizeof(short_ad)) > maxoffset)
+ if ((*offset + sizeof(struct short_ad)) > maxoffset)
return NULL;
else {
- sa = (short_ad *)ptr;
+ sa = (struct short_ad *)ptr;
if (sa->extLength == 0)
return NULL;
}
if (inc)
- *offset += sizeof(short_ad);
+ *offset += sizeof(struct short_ad);
return sa;
}
-long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
+struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
{
- long_ad *la;
+ struct long_ad *la;
if ((!ptr) || (!offset)) {
printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
return NULL;
}
- if ((*offset + sizeof(long_ad)) > maxoffset)
+ if ((*offset + sizeof(struct long_ad)) > maxoffset)
return NULL;
else {
- la = (long_ad *)ptr;
+ la = (struct long_ad *)ptr;
if (la->extLength == 0)
return NULL;
}
if (inc)
- *offset += sizeof(long_ad);
+ *offset += sizeof(struct long_ad);
return la;
}
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index a0974df82b3..4792b771aa8 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -38,10 +38,10 @@
#define _ECMA_167_H 1
/* Character set specification (ECMA 167r3 1/7.2.1) */
-typedef struct {
+struct charspec {
uint8_t charSetType;
uint8_t charSetInfo[63];
-} __attribute__ ((packed)) charspec;
+} __attribute__ ((packed));
/* Character Set Type (ECMA 167r3 1/7.2.1.1) */
#define CHARSPEC_TYPE_CS0 0x00 /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
typedef uint8_t dstring;
/* Timestamp (ECMA 167r3 1/7.3) */
-typedef struct {
+struct timestamp {
__le16 typeAndTimezone;
__le16 year;
uint8_t month;
@@ -68,7 +68,7 @@ typedef struct {
uint8_t centiseconds;
uint8_t hundredsOfMicroseconds;
uint8_t microseconds;
-} __attribute__ ((packed)) timestamp;
+} __attribute__ ((packed));
/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
#define TIMESTAMP_TYPE_MASK 0xF000
@@ -78,11 +78,11 @@ typedef struct {
#define TIMESTAMP_TIMEZONE_MASK 0x0FFF
/* Entity identifier (ECMA 167r3 1/7.4) */
-typedef struct {
+struct regid {
uint8_t flags;
uint8_t ident[23];
uint8_t identSuffix[8];
-} __attribute__ ((packed)) regid;
+} __attribute__ ((packed));
/* Flags (ECMA 167r3 1/7.4.1) */
#define ENTITYID_FLAGS_DIRTY 0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
/* Boot Descriptor (ECMA 167r3 2/9.4) */
struct bootDesc {
- uint8_t structType;
- uint8_t stdIdent[VSD_STD_ID_LEN];
- uint8_t structVersion;
- uint8_t reserved1;
- regid archType;
- regid bootIdent;
- __le32 bootExtLocation;
- __le32 bootExtLength;
- __le64 loadAddress;
- __le64 startAddress;
- timestamp descCreationDateAndTime;
- __le16 flags;
- uint8_t reserved2[32];
- uint8_t bootUse[1906];
+ uint8_t structType;
+ uint8_t stdIdent[VSD_STD_ID_LEN];
+ uint8_t structVersion;
+ uint8_t reserved1;
+ struct regid archType;
+ struct regid bootIdent;
+ __le32 bootExtLocation;
+ __le32 bootExtLength;
+ __le64 loadAddress;
+ __le64 startAddress;
+ struct timestamp descCreationDateAndTime;
+ __le16 flags;
+ uint8_t reserved2[32];
+ uint8_t bootUse[1906];
} __attribute__ ((packed));
/* Flags (ECMA 167r3 2/9.4.12) */
#define BOOT_FLAGS_ERASE 0x01
/* Extent Descriptor (ECMA 167r3 3/7.1) */
-typedef struct {
+struct extent_ad {
__le32 extLength;
__le32 extLocation;
-} __attribute__ ((packed)) extent_ad;
+} __attribute__ ((packed));
-typedef struct {
+struct kernel_extent_ad {
uint32_t extLength;
uint32_t extLocation;
-} kernel_extent_ad;
+};
/* Descriptor Tag (ECMA 167r3 3/7.2) */
-typedef struct {
+struct tag {
__le16 tagIdent;
__le16 descVersion;
uint8_t tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
__le16 descCRC;
__le16 descCRCLength;
__le32 tagLocation;
-} __attribute__ ((packed)) tag;
+} __attribute__ ((packed));
/* Tag Identifier (ECMA 167r3 3/7.2.1) */
#define TAG_IDENT_PVD 0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
struct primaryVolDesc {
- tag descTag;
- __le32 volDescSeqNum;
- __le32 primaryVolDescNum;
- dstring volIdent[32];
- __le16 volSeqNum;
- __le16 maxVolSeqNum;
- __le16 interchangeLvl;
- __le16 maxInterchangeLvl;
- __le32 charSetList;
- __le32 maxCharSetList;
- dstring volSetIdent[128];
- charspec descCharSet;
- charspec explanatoryCharSet;
- extent_ad volAbstract;
- extent_ad volCopyright;
- regid appIdent;
- timestamp recordingDateAndTime;
- regid impIdent;
- uint8_t impUse[64];
- __le32 predecessorVolDescSeqLocation;
- __le16 flags;
- uint8_t reserved[22];
+ struct tag descTag;
+ __le32 volDescSeqNum;
+ __le32 primaryVolDescNum;
+ dstring volIdent[32];
+ __le16 volSeqNum;
+ __le16 maxVolSeqNum;
+ __le16 interchangeLvl;
+ __le16 maxInterchangeLvl;
+ __le32 charSetList;
+ __le32 maxCharSetList;
+ dstring volSetIdent[128];
+ struct charspec descCharSet;
+ struct charspec explanatoryCharSet;
+ struct extent_ad volAbstract;
+ struct extent_ad volCopyright;
+ struct regid appIdent;
+ struct timestamp recordingDateAndTime;
+ struct regid impIdent;
+ uint8_t impUse[64];
+ __le32 predecessorVolDescSeqLocation;
+ __le16 flags;
+ uint8_t reserved[22];
} __attribute__ ((packed));
/* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
/* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
struct anchorVolDescPtr {
- tag descTag;
- extent_ad mainVolDescSeqExt;
- extent_ad reserveVolDescSeqExt;
- uint8_t reserved[480];
+ struct tag descTag;
+ struct extent_ad mainVolDescSeqExt;
+ struct extent_ad reserveVolDescSeqExt;
+ uint8_t reserved[480];
} __attribute__ ((packed));
/* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
struct volDescPtr {
- tag descTag;
- __le32 volDescSeqNum;
- extent_ad nextVolDescSeqExt;
- uint8_t reserved[484];
+ struct tag descTag;
+ __le32 volDescSeqNum;
+ struct extent_ad nextVolDescSeqExt;
+ uint8_t reserved[484];
} __attribute__ ((packed));
/* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
struct impUseVolDesc {
- tag descTag;
+ struct tag descTag;
__le32 volDescSeqNum;
- regid impIdent;
+ struct regid impIdent;
uint8_t impUse[460];
} __attribute__ ((packed));
/* Partition Descriptor (ECMA 167r3 3/10.5) */
struct partitionDesc {
- tag descTag;
+ struct tag descTag;
__le32 volDescSeqNum;
__le16 partitionFlags;
__le16 partitionNumber;
- regid partitionContents;
+ struct regid partitionContents;
uint8_t partitionContentsUse[128];
__le32 accessType;
__le32 partitionStartingLocation;
__le32 partitionLength;
- regid impIdent;
+ struct regid impIdent;
uint8_t impUse[128];
uint8_t reserved[156];
} __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
/* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
struct logicalVolDesc {
- tag descTag;
- __le32 volDescSeqNum;
- charspec descCharSet;
- dstring logicalVolIdent[128];
- __le32 logicalBlockSize;
- regid domainIdent;
- uint8_t logicalVolContentsUse[16];
- __le32 mapTableLength;
- __le32 numPartitionMaps;
- regid impIdent;
- uint8_t impUse[128];
- extent_ad integritySeqExt;
- uint8_t partitionMaps[0];
+ struct tag descTag;
+ __le32 volDescSeqNum;
+ struct charspec descCharSet;
+ dstring logicalVolIdent[128];
+ __le32 logicalBlockSize;
+ struct regid domainIdent;
+ uint8_t logicalVolContentsUse[16];
+ __le32 mapTableLength;
+ __le32 numPartitionMaps;
+ struct regid impIdent;
+ uint8_t impUse[128];
+ struct extent_ad integritySeqExt;
+ uint8_t partitionMaps[0];
} __attribute__ ((packed));
/* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
/* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
struct unallocSpaceDesc {
- tag descTag;
- __le32 volDescSeqNum;
- __le32 numAllocDescs;
- extent_ad allocDescs[0];
+ struct tag descTag;
+ __le32 volDescSeqNum;
+ __le32 numAllocDescs;
+ struct extent_ad allocDescs[0];
} __attribute__ ((packed));
/* Terminating Descriptor (ECMA 167r3 3/10.9) */
struct terminatingDesc {
- tag descTag;
+ struct tag descTag;
uint8_t reserved[496];
} __attribute__ ((packed));
/* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
struct logicalVolIntegrityDesc {
- tag descTag;
- timestamp recordingDateAndTime;
- __le32 integrityType;
- extent_ad nextIntegrityExt;
- uint8_t logicalVolContentsUse[32];
- __le32 numOfPartitions;
- __le32 lengthOfImpUse;
- __le32 freeSpaceTable[0];
- __le32 sizeTable[0];
- uint8_t impUse[0];
+ struct tag descTag;
+ struct timestamp recordingDateAndTime;
+ __le32 integrityType;
+ struct extent_ad nextIntegrityExt;
+ uint8_t logicalVolContentsUse[32];
+ __le32 numOfPartitions;
+ __le32 lengthOfImpUse;
+ __le32 freeSpaceTable[0];
+ __le32 sizeTable[0];
+ uint8_t impUse[0];
} __attribute__ ((packed));
/* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
#define LVID_INTEGRITY_TYPE_CLOSE 0x00000001
/* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
__le32 logicalBlockNum;
__le16 partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
/* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
uint32_t logicalBlockNum;
uint16_t partitionReferenceNum;
-} kernel_lb_addr;
+};
/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
__le32 extLength;
__le32 extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
__le32 extLength;
- lb_addr extLocation;
+ struct lb_addr extLocation;
uint8_t impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
-typedef struct {
- uint32_t extLength;
- kernel_lb_addr extLocation;
- uint8_t impUse[6];
-} kernel_long_ad;
+struct kernel_long_ad {
+ uint32_t extLength;
+ struct kernel_lb_addr extLocation;
+ uint8_t impUse[6];
+};
/* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
__le32 extLength;
__le32 recordedLength;
__le32 informationLength;
- lb_addr extLocation;
-} __attribute__ ((packed)) ext_ad;
+ struct lb_addr extLocation;
+} __attribute__ ((packed));
-typedef struct {
- uint32_t extLength;
- uint32_t recordedLength;
- uint32_t informationLength;
- kernel_lb_addr extLocation;
-} kernel_ext_ad;
+struct kernel_ext_ad {
+ uint32_t extLength;
+ uint32_t recordedLength;
+ uint32_t informationLength;
+ struct kernel_lb_addr extLocation;
+};
/* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
@@ -415,44 +415,44 @@ typedef struct {
/* File Set Descriptor (ECMA 167r3 4/14.1) */
struct fileSetDesc {
- tag descTag;
- timestamp recordingDateAndTime;
- __le16 interchangeLvl;
- __le16 maxInterchangeLvl;
- __le32 charSetList;
- __le32 maxCharSetList;
- __le32 fileSetNum;
- __le32 fileSetDescNum;
- charspec logicalVolIdentCharSet;
- dstring logicalVolIdent[128];
- charspec fileSetCharSet;
- dstring fileSetIdent[32];
- dstring copyrightFileIdent[32];
- dstring abstractFileIdent[32];
- long_ad rootDirectoryICB;
- regid domainIdent;
- long_ad nextExt;
- long_ad streamDirectoryICB;
- uint8_t reserved[32];
+ struct tag descTag;
+ struct timestamp recordingDateAndTime;
+ __le16 interchangeLvl;
+ __le16 maxInterchangeLvl;
+ __le32 charSetList;
+ __le32 maxCharSetList;
+ __le32 fileSetNum;
+ __le32 fileSetDescNum;
+ struct charspec logicalVolIdentCharSet;
+ dstring logicalVolIdent[128];
+ struct charspec fileSetCharSet;
+ dstring fileSetIdent[32];
+ dstring copyrightFileIdent[32];
+ dstring abstractFileIdent[32];
+ struct long_ad rootDirectoryICB;
+ struct regid domainIdent;
+ struct long_ad nextExt;
+ struct long_ad streamDirectoryICB;
+ uint8_t reserved[32];
} __attribute__ ((packed));
/* Partition Header Descriptor (ECMA 167r3 4/14.3) */
struct partitionHeaderDesc {
- short_ad unallocSpaceTable;
- short_ad unallocSpaceBitmap;
- short_ad partitionIntegrityTable;
- short_ad freedSpaceTable;
- short_ad freedSpaceBitmap;
+ struct short_ad unallocSpaceTable;
+ struct short_ad unallocSpaceBitmap;
+ struct short_ad partitionIntegrityTable;
+ struct short_ad freedSpaceTable;
+ struct short_ad freedSpaceBitmap;
uint8_t reserved[88];
} __attribute__ ((packed));
/* File Identifier Descriptor (ECMA 167r3 4/14.4) */
struct fileIdentDesc {
- tag descTag;
+ struct tag descTag;
__le16 fileVersionNum;
uint8_t fileCharacteristics;
uint8_t lengthFileIdent;
- long_ad icb;
+ struct long_ad icb;
__le16 lengthOfImpUse;
uint8_t impUse[0];
uint8_t fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
/* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
struct allocExtDesc {
- tag descTag;
+ struct tag descTag;
__le32 previousAllocExtLocation;
__le32 lengthAllocDescs;
} __attribute__ ((packed));
/* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
__le32 priorRecordedNumDirectEntries;
__le16 strategyType;
__le16 strategyParameter;
__le16 numEntries;
uint8_t reserved;
uint8_t fileType;
- lb_addr parentICBLocation;
+ struct lb_addr parentICBLocation;
__le16 flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
/* Strategy Type (ECMA 167r3 4/14.6.2) */
#define ICBTAG_STRATEGY_TYPE_UNDEF 0x0000
@@ -528,41 +528,41 @@ typedef struct {
/* Indirect Entry (ECMA 167r3 4/14.7) */
struct indirectEntry {
- tag descTag;
- icbtag icbTag;
- long_ad indirectICB;
+ struct tag descTag;
+ struct icbtag icbTag;
+ struct long_ad indirectICB;
} __attribute__ ((packed));
/* Terminal Entry (ECMA 167r3 4/14.8) */
struct terminalEntry {
- tag descTag;
- icbtag icbTag;
+ struct tag descTag;
+ struct icbtag icbTag;
} __attribute__ ((packed));
/* File Entry (ECMA 167r3 4/14.9) */
struct fileEntry {
- tag descTag;
- icbtag icbTag;
- __le32 uid;
- __le32 gid;
- __le32 permissions;
- __le16 fileLinkCount;
- uint8_t recordFormat;
- uint8_t recordDisplayAttr;
- __le32 recordLength;
- __le64 informationLength;
- __le64 logicalBlocksRecorded;
- timestamp accessTime;
- timestamp modificationTime;
- timestamp attrTime;
- __le32 checkpoint;
- long_ad extendedAttrICB;
- regid impIdent;
- __le64 uniqueID;
- __le32 lengthExtendedAttr;
- __le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ struct tag descTag;
+ struct icbtag icbTag;
+ __le32 uid;
+ __le32 gid;
+ __le32 permissions;
+ __le16 fileLinkCount;
+ uint8_t recordFormat;
+ uint8_t recordDisplayAttr;
+ __le32 recordLength;
+ __le64 informationLength;
+ __le64 logicalBlocksRecorded;
+ struct timestamp accessTime;
+ struct timestamp modificationTime;
+ struct timestamp attrTime;
+ __le32 checkpoint;
+ struct long_ad extendedAttrICB;
+ struct regid impIdent;
+ __le64 uniqueID;
+ __le32 lengthExtendedAttr;
+ __le32 lengthAllocDescs;
+ uint8_t extendedAttr[0];
+ uint8_t allocDescs[0];
} __attribute__ ((packed));
/* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
/* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
struct extendedAttrHeaderDesc {
- tag descTag;
+ struct tag descTag;
__le32 impAttrLocation;
__le32 appAttrLocation;
} __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
uint8_t reserved[3];
__le32 attrLength;
__le32 impUseLength;
- regid impIdent;
+ struct regid impIdent;
uint8_t impUse[0];
} __attribute__ ((packed));
@@ -698,7 +698,7 @@ struct appUseExtAttr {
uint8_t reserved[3];
__le32 attrLength;
__le32 appUseLength;
- regid appIdent;
+ struct regid appIdent;
uint8_t appUse[0];
} __attribute__ ((packed));
@@ -712,15 +712,15 @@ struct appUseExtAttr {
/* Unallocated Space Entry (ECMA 167r3 4/14.11) */
struct unallocSpaceEntry {
- tag descTag;
- icbtag icbTag;
+ struct tag descTag;
+ struct icbtag icbTag;
__le32 lengthAllocDescs;
uint8_t allocDescs[0];
} __attribute__ ((packed));
/* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
struct spaceBitmapDesc {
- tag descTag;
+ struct tag descTag;
__le32 numOfBits;
__le32 numOfBytes;
uint8_t bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
/* Partition Integrity Entry (ECMA 167r3 4/14.13) */
struct partitionIntegrityEntry {
- tag descTag;
- icbtag icbTag;
- timestamp recordingDateAndTime;
- uint8_t integrityType;
- uint8_t reserved[175];
- regid impIdent;
- uint8_t impUse[256];
+ struct tag descTag;
+ struct icbtag icbTag;
+ struct timestamp recordingDateAndTime;
+ uint8_t integrityType;
+ uint8_t reserved[175];
+ struct regid impIdent;
+ uint8_t impUse[256];
} __attribute__ ((packed));
/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
/* File Entry (ECMA 167r3 4/14.17) */
struct extendedFileEntry {
- tag descTag;
- icbtag icbTag;
- __le32 uid;
- __le32 gid;
- __le32 permissions;
- __le16 fileLinkCount;
- uint8_t recordFormat;
- uint8_t recordDisplayAttr;
- __le32 recordLength;
- __le64 informationLength;
- __le64 objectSize;
- __le64 logicalBlocksRecorded;
- timestamp accessTime;
- timestamp modificationTime;
- timestamp createTime;
- timestamp attrTime;
- __le32 checkpoint;
- __le32 reserved;
- long_ad extendedAttrICB;
- long_ad streamDirectoryICB;
- regid impIdent;
- __le64 uniqueID;
- __le32 lengthExtendedAttr;
- __le32 lengthAllocDescs;
- uint8_t extendedAttr[0];
- uint8_t allocDescs[0];
+ struct tag descTag;
+ struct icbtag icbTag;
+ __le32 uid;
+ __le32 gid;
+ __le32 permissions;
+ __le16 fileLinkCount;
+ uint8_t recordFormat;
+ uint8_t recordDisplayAttr;
+ __le32 recordLength;
+ __le64 informationLength;
+ __le64 objectSize;
+ __le64 logicalBlocksRecorded;
+ struct timestamp accessTime;
+ struct timestamp modificationTime;
+ struct timestamp createTime;
+ struct timestamp attrTime;
+ __le32 checkpoint;
+ __le32 reserved;
+ struct long_ad extendedAttrICB;
+ struct long_ad streamDirectoryICB;
+ struct regid impIdent;
+ __le64 uniqueID;
+ __le32 lengthExtendedAttr;
+ __le32 lengthAllocDescs;
+ uint8_t extendedAttr[0];
+ uint8_t allocDescs[0];
} __attribute__ ((packed));
#endif /* _ECMA_167_H */
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 47dbe5613f9..c10fa39f97e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
le32_add_cpu(&lvidiu->numDirs, -1);
else
le32_add_cpu(&lvidiu->numFiles, -1);
-
- mark_buffer_dirty(sbi->s_lvid_bh);
+ udf_updated_lvid(sb);
}
mutex_unlock(&sbi->s_alloc_mutex);
- udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+ udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
}
struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
if (!(++uniqueID & 0x00000000FFFFFFFFUL))
uniqueID += 16;
lvhd->uniqueID = cpu_to_le64(uniqueID);
- mark_buffer_dirty(sbi->s_lvid_bh);
+ udf_updated_lvid(sb);
}
mutex_unlock(&sbi->s_alloc_mutex);
inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
iinfo->i_location.logicalBlockNum = block;
iinfo->i_location.partitionReferenceNum =
dinfo->i_location.partitionReferenceNum;
- inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+ inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
inode->i_blocks = 0;
iinfo->i_lenEAttr = 0;
iinfo->i_lenAlloc = 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 30ebde490f7..e7533f78563 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
sector_t *, int *);
static int8_t udf_insert_aext(struct inode *, struct extent_position,
- kernel_lb_addr, uint32_t);
+ struct kernel_lb_addr, uint32_t);
static void udf_split_extents(struct inode *, int *, int, int,
- kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+ struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
static void udf_prealloc_extents(struct inode *, int, int,
- kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+ struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
static void udf_merge_extents(struct inode *,
- kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+ struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
static void udf_update_extents(struct inode *,
- kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+ struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
struct extent_position *);
static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
{
int newblock;
struct buffer_head *dbh = NULL;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
uint8_t alloctype;
struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
epos.bh = NULL;
epos.block = iinfo->i_location;
epos.offset = udf_file_entry_alloc_offset(inode);
- udf_add_aext(inode, &epos, eloc, elen, 0);
+ udf_add_aext(inode, &epos, &eloc, elen, 0);
/* UniqueID stuff */
brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
/* Extend the file by 'blocks' blocks, return the number of extents added */
int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
- kernel_long_ad *last_ext, sector_t blocks)
+ struct kernel_long_ad *last_ext, sector_t blocks)
{
sector_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
- kernel_lb_addr prealloc_loc = {};
+ struct kernel_lb_addr prealloc_loc = {};
int prealloc_len = 0;
struct udf_inode_info *iinfo;
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
}
if (fake) {
- udf_add_aext(inode, last_pos, last_ext->extLocation,
+ udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
count++;
} else
- udf_write_aext(inode, last_pos, last_ext->extLocation,
+ udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
/* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
/* Create enough extents to cover the whole hole */
while (blocks > add) {
blocks -= add;
- if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+ if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1) == -1)
return -1;
count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
if (blocks) {
last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
(blocks << sb->s_blocksize_bits);
- if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+ if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1) == -1)
return -1;
count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
out:
/* Do we have some preallocated blocks saved? */
if (prealloc_len) {
- if (udf_add_aext(inode, last_pos, prealloc_loc,
+ if (udf_add_aext(inode, last_pos, &prealloc_loc,
prealloc_len, 1) == -1)
return -1;
last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
/* last_pos should point to the last written extent... */
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- last_pos->offset -= sizeof(short_ad);
+ last_pos->offset -= sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- last_pos->offset -= sizeof(long_ad);
+ last_pos->offset -= sizeof(struct long_ad);
else
return -1;
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
{
static sector_t last_block;
struct buffer_head *result = NULL;
- kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+ struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
struct extent_position prev_epos, cur_epos, next_epos;
int count = 0, startnum = 0, endnum = 0;
uint32_t elen = 0, tmpelen;
- kernel_lb_addr eloc, tmpeloc;
+ struct kernel_lb_addr eloc, tmpeloc;
int c = 1;
loff_t lbcount = 0, b_off = 0;
uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
elen = EXT_RECORDED_ALLOCATED |
((elen + inode->i_sb->s_blocksize - 1) &
~(inode->i_sb->s_blocksize - 1));
- etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+ etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
}
brelse(prev_epos.bh);
brelse(cur_epos.bh);
brelse(next_epos.bh);
- newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+ newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
*phys = newblock;
return NULL;
}
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
} else {
/* Create a fake extent when there's not one */
memset(&laarr[0].extLocation, 0x00,
- sizeof(kernel_lb_addr));
+ sizeof(struct kernel_lb_addr));
laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
/* Will udf_extend_file() create real extent from
a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
inode->i_sb->s_blocksize;
memset(&laarr[c].extLocation, 0x00,
- sizeof(kernel_lb_addr));
+ sizeof(struct kernel_lb_addr));
count++;
endnum++;
}
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
static void udf_split_extents(struct inode *inode, int *c, int offset,
int newblocknum,
- kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+ struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
int *endnum)
{
unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
if (offset) {
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
udf_free_blocks(inode->i_sb, inode,
- laarr[curr].extLocation,
+ &laarr[curr].extLocation,
0, offset);
laarr[curr].extLength =
EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
}
static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
- kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+ struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
int *endnum)
{
int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
inode->i_sb->s_blocksize_bits);
else {
memmove(&laarr[c + 2], &laarr[c + 1],
- sizeof(long_ad) * (*endnum - (c + 1)));
+ sizeof(struct long_ad) * (*endnum - (c + 1)));
(*endnum)++;
laarr[c + 1].extLocation.logicalBlockNum = next;
laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
if (*endnum > (i + 1))
memmove(&laarr[i],
&laarr[i + 1],
- sizeof(long_ad) *
+ sizeof(struct long_ad) *
(*endnum - (i + 1)));
i--;
(*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
}
static void udf_merge_extents(struct inode *inode,
- kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+ struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
int *endnum)
{
int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
for (i = 0; i < (*endnum - 1); i++) {
- kernel_long_ad *li /*l[i]*/ = &laarr[i];
- kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+ struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+ struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
(((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
blocksize - 1) & ~(blocksize - 1));
if (*endnum > (i + 2))
memmove(&laarr[i + 1], &laarr[i + 2],
- sizeof(long_ad) *
+ sizeof(struct long_ad) *
(*endnum - (i + 2)));
i--;
(*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
(EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
((lip1->extLength >> 30) ==
(EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
- udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+ udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
((li->extLength &
UDF_EXTENT_LENGTH_MASK) +
blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
blocksize - 1) & ~(blocksize - 1));
if (*endnum > (i + 2))
memmove(&laarr[i + 1], &laarr[i + 2],
- sizeof(long_ad) *
+ sizeof(struct long_ad) *
(*endnum - (i + 2)));
i--;
(*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
} else if ((li->extLength >> 30) ==
(EXT_NOT_RECORDED_ALLOCATED >> 30)) {
udf_free_blocks(inode->i_sb, inode,
- li->extLocation, 0,
+ &li->extLocation, 0,
((li->extLength &
UDF_EXTENT_LENGTH_MASK) +
blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
}
static void udf_update_extents(struct inode *inode,
- kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+ struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
int startnum, int endnum,
struct extent_position *epos)
{
int start = 0, i;
- kernel_lb_addr tmploc;
+ struct kernel_lb_addr tmploc;
uint32_t tmplen;
if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
for (i = start; i < endnum; i++) {
udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
- udf_write_aext(inode, epos, laarr[i].extLocation,
+ udf_write_aext(inode, epos, &laarr[i].extLocation,
laarr[i].extLength, 1);
}
}
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
* i_nlink = 1
* i_op = NULL;
*/
- bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+ bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
if (!bh) {
printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
struct buffer_head *ibh;
- ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+ ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
&ident);
if (ident == TAG_IDENT_IE && ibh) {
struct buffer_head *nbh = NULL;
- kernel_lb_addr loc;
+ struct kernel_lb_addr loc;
struct indirectEntry *ie;
ie = (struct indirectEntry *)ibh->b_data;
loc = lelb_to_cpu(ie->indirectICB.extLocation);
if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+ (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
&ident))) {
if (ident == TAG_IDENT_FE ||
ident == TAG_IDENT_EFE) {
memcpy(&iinfo->i_location,
&loc,
- sizeof(kernel_lb_addr));
+ sizeof(struct kernel_lb_addr));
brelse(bh);
brelse(ibh);
brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
inode->i_size = le64_to_cpu(fe->informationLength);
iinfo->i_lenExtents = inode->i_size;
- inode->i_mode = udf_convert_permissions(fe);
- inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+ if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+ sbi->s_fmode != UDF_INVALID_MODE)
+ inode->i_mode = sbi->s_fmode;
+ else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+ sbi->s_dmode != UDF_INVALID_MODE)
+ inode->i_mode = sbi->s_dmode;
+ else
+ inode->i_mode = udf_convert_permissions(fe);
+ inode->i_mode &= ~sbi->s_umask;
if (iinfo->i_efe == 0) {
inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
bh = udf_tread(inode->i_sb,
udf_get_lb_pblock(inode->i_sb,
- iinfo->i_location, 0));
+ &iinfo->i_location, 0));
if (!bh) {
udf_debug("bread failure\n");
return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
crclen = sizeof(struct unallocSpaceEntry) +
- iinfo->i_lenAlloc - sizeof(tag);
+ iinfo->i_lenAlloc - sizeof(struct tag);
use->descTag.tagLocation = cpu_to_le32(
iinfo->i_location.
logicalBlockNum);
use->descTag.descCRCLength = cpu_to_le16(crclen);
use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
- sizeof(tag),
+ sizeof(struct tag),
crclen));
use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
fe->informationLength = cpu_to_le64(inode->i_size);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- regid *eid;
+ struct regid *eid;
struct deviceSpec *dsea =
(struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
if (!dsea) {
dsea = (struct deviceSpec *)
udf_add_extendedattr(inode,
sizeof(struct deviceSpec) +
- sizeof(regid), 12, 0x3);
+ sizeof(struct regid), 12, 0x3);
dsea->attrType = cpu_to_le32(12);
dsea->attrSubtype = 1;
dsea->attrLength = cpu_to_le32(
sizeof(struct deviceSpec) +
- sizeof(regid));
- dsea->impUseLength = cpu_to_le32(sizeof(regid));
+ sizeof(struct regid));
+ dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
}
- eid = (regid *)dsea->impUse;
- memset(eid, 0, sizeof(regid));
+ eid = (struct regid *)dsea->impUse;
+ memset(eid, 0, sizeof(struct regid));
strcpy(eid->ident, UDF_ID_DEVELOPER);
eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
- memset(&(fe->impIdent), 0, sizeof(regid));
+ memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
- memset(&(efe->impIdent), 0, sizeof(regid));
+ memset(&(efe->impIdent), 0, sizeof(struct regid));
strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
fe->descTag.tagLocation = cpu_to_le32(
iinfo->i_location.logicalBlockNum);
crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
- sizeof(tag);
+ sizeof(struct tag);
fe->descTag.descCRCLength = cpu_to_le16(crclen);
- fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+ fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
crclen));
fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
return err;
}
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
{
unsigned long block = udf_get_lb_pblock(sb, ino, 0);
struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
return NULL;
if (inode->i_state & I_NEW) {
- memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+ memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
__udf_read_inode(inode);
unlock_new_inode(inode);
}
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
if (is_bad_inode(inode))
goto out_iput;
- if (ino.logicalBlockNum >= UDF_SB(sb)->
- s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+ if (ino->logicalBlockNum >= UDF_SB(sb)->
+ s_partmaps[ino->partitionReferenceNum].s_partition_len) {
udf_debug("block=%d, partition=%d out of range\n",
- ino.logicalBlockNum, ino.partitionReferenceNum);
+ ino->logicalBlockNum, ino->partitionReferenceNum);
make_bad_inode(inode);
goto out_iput;
}
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
}
int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
- kernel_lb_addr eloc, uint32_t elen, int inc)
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
int adsize;
- short_ad *sad = NULL;
- long_ad *lad = NULL;
+ struct short_ad *sad = NULL;
+ struct long_ad *lad = NULL;
struct allocExtDesc *aed;
int8_t etype;
uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
ptr = epos->bh->b_data + epos->offset;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
return -1;
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
char *sptr, *dptr;
struct buffer_head *nbh;
int err, loffset;
- kernel_lb_addr obloc = epos->block;
+ struct kernel_lb_addr obloc = epos->block;
epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
if (!epos->block.logicalBlockNum)
return -1;
nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
- epos->block,
+ &epos->block,
0));
if (!nbh)
return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
}
if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
- epos->block.logicalBlockNum, sizeof(tag));
+ epos->block.logicalBlockNum, sizeof(struct tag));
else
udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
- epos->block.logicalBlockNum, sizeof(tag));
+ epos->block.logicalBlockNum, sizeof(struct tag));
switch (iinfo->i_alloc_type) {
case ICBTAG_FLAG_AD_SHORT:
- sad = (short_ad *)sptr;
+ sad = (struct short_ad *)sptr;
sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
inode->i_sb->s_blocksize);
sad->extPosition =
cpu_to_le32(epos->block.logicalBlockNum);
break;
case ICBTAG_FLAG_AD_LONG:
- lad = (long_ad *)sptr;
+ lad = (struct long_ad *)sptr;
lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
inode->i_sb->s_blocksize);
lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
}
int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
- kernel_lb_addr eloc, uint32_t elen, int inc)
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
int adsize;
uint8_t *ptr;
- short_ad *sad;
- long_ad *lad;
+ struct short_ad *sad;
+ struct long_ad *lad;
struct udf_inode_info *iinfo = UDF_I(inode);
if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
switch (iinfo->i_alloc_type) {
case ICBTAG_FLAG_AD_SHORT:
- sad = (short_ad *)ptr;
+ sad = (struct short_ad *)ptr;
sad->extLength = cpu_to_le32(elen);
- sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
- adsize = sizeof(short_ad);
+ sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+ adsize = sizeof(struct short_ad);
break;
case ICBTAG_FLAG_AD_LONG:
- lad = (long_ad *)ptr;
+ lad = (struct long_ad *)ptr;
lad->extLength = cpu_to_le32(elen);
- lad->extLocation = cpu_to_lelb(eloc);
+ lad->extLocation = cpu_to_lelb(*eloc);
memset(lad->impUse, 0x00, sizeof(lad->impUse));
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
break;
default:
return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
}
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
- kernel_lb_addr *eloc, uint32_t *elen, int inc)
+ struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
- block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+ block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
epos->bh = udf_tread(inode->i_sb, block);
if (!epos->bh) {
udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
}
int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
- kernel_lb_addr *eloc, uint32_t *elen, int inc)
+ struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int alen;
int8_t etype;
uint8_t *ptr;
- short_ad *sad;
- long_ad *lad;
+ struct short_ad *sad;
+ struct long_ad *lad;
struct udf_inode_info *iinfo = UDF_I(inode);
if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
}
static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
- kernel_lb_addr neloc, uint32_t nelen)
+ struct kernel_lb_addr neloc, uint32_t nelen)
{
- kernel_lb_addr oeloc;
+ struct kernel_lb_addr oeloc;
uint32_t oelen;
int8_t etype;
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
get_bh(epos.bh);
while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
- udf_write_aext(inode, &epos, neloc, nelen, 1);
+ udf_write_aext(inode, &epos, &neloc, nelen, 1);
neloc = oeloc;
nelen = (etype << 30) | oelen;
}
- udf_add_aext(inode, &epos, neloc, nelen, 1);
+ udf_add_aext(inode, &epos, &neloc, nelen, 1);
brelse(epos.bh);
return (nelen >> 30);
}
int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
- kernel_lb_addr eloc, uint32_t elen)
+ struct kernel_lb_addr eloc, uint32_t elen)
{
struct extent_position oepos;
int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
iinfo = UDF_I(inode);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
adsize = 0;
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
return -1;
while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
- udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+ udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
if (oepos.bh != epos.bh) {
oepos.block = epos.block;
brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
oepos.offset = epos.offset - adsize;
}
}
- memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+ memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
elen = 0;
if (epos.bh != oepos.bh) {
- udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
- udf_write_aext(inode, &oepos, eloc, elen, 1);
- udf_write_aext(inode, &oepos, eloc, elen, 1);
+ udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+ udf_write_aext(inode, &oepos, &eloc, elen, 1);
+ udf_write_aext(inode, &oepos, &eloc, elen, 1);
if (!oepos.bh) {
iinfo->i_lenAlloc -= (adsize * 2);
mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
mark_buffer_dirty_inode(oepos.bh, inode);
}
} else {
- udf_write_aext(inode, &oepos, eloc, elen, 1);
+ udf_write_aext(inode, &oepos, &eloc, elen, 1);
if (!oepos.bh) {
iinfo->i_lenAlloc -= adsize;
mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
}
int8_t inode_bmap(struct inode *inode, sector_t block,
- struct extent_position *pos, kernel_lb_addr *eloc,
+ struct extent_position *pos, struct kernel_lb_addr *eloc,
uint32_t *elen, sector_t *offset)
{
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
long udf_block_map(struct inode *inode, sector_t block)
{
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
sector_t offset;
struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
(EXT_RECORDED_ALLOCATED >> 30))
- ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+ ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
else
ret = 0;
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f..9215700c00a 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
}
}
/* rewrite CRC + checksum of eahd */
- crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+ crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
eahd->descTag.descCRCLength = cpu_to_le16(crclen);
eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
- sizeof(tag), crclen));
+ sizeof(struct tag), crclen));
eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
iinfo->i_lenEAttr += size;
return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
uint32_t location, uint16_t *ident)
{
- tag *tag_p;
+ struct tag *tag_p;
struct buffer_head *bh = NULL;
/* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
return NULL;
}
- tag_p = (tag *)(bh->b_data);
+ tag_p = (struct tag *)(bh->b_data);
*ident = le16_to_cpu(tag_p->tagIdent);
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
}
/* Verify the descriptor CRC */
- if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
- bh->b_data + sizeof(tag),
+ bh->b_data + sizeof(struct tag),
le16_to_cpu(tag_p->descCRCLength)))
return bh;
@@ -255,27 +255,28 @@ error_out:
return NULL;
}
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+ struct kernel_lb_addr *loc,
uint32_t offset, uint16_t *ident)
{
return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
- loc.logicalBlockNum + offset, ident);
+ loc->logicalBlockNum + offset, ident);
}
void udf_update_tag(char *data, int length)
{
- tag *tptr = (tag *)data;
- length -= sizeof(tag);
+ struct tag *tptr = (struct tag *)data;
+ length -= sizeof(struct tag);
tptr->descCRCLength = cpu_to_le16(length);
- tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+ tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
tptr->tagChecksum = udf_tag_checksum(tptr);
}
void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
uint32_t loc, int length)
{
- tag *tptr = (tag *)data;
+ struct tag *tptr = (struct tag *)data;
tptr->tagIdent = cpu_to_le16(ident);
tptr->descVersion = cpu_to_le16(version);
tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
udf_update_tag(data, length);
}
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
{
u8 *data = (u8 *)t;
u8 checksum = 0;
int i;
- for (i = 0; i < sizeof(tag); ++i)
+ for (i = 0; i < sizeof(struct tag); ++i)
if (i != 4) /* position of checksum */
checksum += data[i];
return checksum;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d94..6a29fa34c47 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
uint8_t *impuse, uint8_t *fileident)
{
- uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+ uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
uint16_t crc;
int offset;
uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
- crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
- sizeof(struct fileIdentDesc) - sizeof(tag));
+ crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+ sizeof(struct fileIdentDesc) - sizeof(struct tag));
if (fibh->sbh == fibh->ebh) {
crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
- crclen + sizeof(tag) -
+ crclen + sizeof(struct tag) -
sizeof(struct fileIdentDesc));
} else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
crc = crc_itu_t(crc, fibh->ebh->b_data +
sizeof(struct fileIdentDesc) +
fibh->soffset,
- crclen + sizeof(tag) -
+ crclen + sizeof(struct tag) -
sizeof(struct fileIdentDesc));
} else {
crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
uint8_t lfi;
uint16_t liu;
loff_t size;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
sector_t offset;
struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
&eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
goto out_err;
- block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+ block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- epos.offset -= sizeof(short_ad);
+ epos.offset -= sizeof(struct short_ad);
else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- epos.offset -= sizeof(long_ad);
+ epos.offset -= sizeof(struct long_ad);
} else
offset = 0;
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
#ifdef UDF_RECOVERY
/* temporary shorthand for specifying files by inode number */
if (!strncmp(dentry->d_name.name, ".B=", 3)) {
- kernel_lb_addr lb = {
+ struct kernel_lb_addr lb = {
.logicalBlockNum = 0,
.partitionReferenceNum =
simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
#endif /* UDF_RECOVERY */
if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+ struct kernel_lb_addr loc;
+
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
- inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+ loc = lelb_to_cpu(cfi.icb.extLocation);
+ inode = udf_iget(dir->i_sb, &loc);
if (!inode) {
unlock_kernel();
return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
uint8_t lfi;
uint16_t liu;
int block;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen = 0;
sector_t offset;
struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
&eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
block = udf_get_lb_pblock(dir->i_sb,
- dinfo->i_location, 0);
+ &dinfo->i_location, 0);
fibh->soffset = fibh->eoffset = sb->s_blocksize;
goto add;
}
- block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+ block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- epos.offset -= sizeof(short_ad);
+ epos.offset -= sizeof(struct short_ad);
else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- epos.offset -= sizeof(long_ad);
+ epos.offset -= sizeof(struct long_ad);
} else
offset = 0;
@@ -409,10 +412,10 @@ add:
if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- epos.offset -= sizeof(short_ad);
+ epos.offset -= sizeof(struct short_ad);
else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- epos.offset -= sizeof(long_ad);
- udf_write_aext(dir, &epos, eloc, elen, 1);
+ epos.offset -= sizeof(struct long_ad);
+ udf_write_aext(dir, &epos, &eloc, elen, 1);
}
f_pos += nfidlen;
@@ -494,10 +497,10 @@ add:
memset(cfi, 0, sizeof(struct fileIdentDesc));
if (UDF_SB(sb)->s_udfrev >= 0x0200)
udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
- sizeof(tag));
+ sizeof(struct tag));
else
udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
- sizeof(tag));
+ sizeof(struct tag));
cfi->fileVersionNum = cpu_to_le16(1);
cfi->lengthFileIdent = namelen;
cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
- memset(&(cfi->icb), 0x00, sizeof(long_ad));
+ memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
}
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
loff_t f_pos;
loff_t size = udf_ext0_offset(dir) + dir->i_size;
int block;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
sector_t offset;
struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
&epos, &eloc, &elen, &offset) ==
(EXT_RECORDED_ALLOCATED >> 30)) {
- block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+ block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- epos.offset -= sizeof(short_ad);
+ epos.offset -= sizeof(struct short_ad);
else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- epos.offset -= sizeof(long_ad);
+ epos.offset -= sizeof(struct long_ad);
} else
offset = 0;
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi, cfi;
- kernel_lb_addr tloc;
+ struct kernel_lb_addr tloc;
retval = -ENOENT;
lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
retval = -EIO;
tloc = lelb_to_cpu(cfi.icb.extLocation);
- if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+ if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
goto end_rmdir;
retval = -ENOTEMPTY;
if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi;
struct fileIdentDesc cfi;
- kernel_lb_addr tloc;
+ struct kernel_lb_addr tloc;
retval = -ENOENT;
lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
retval = -EIO;
tloc = lelb_to_cpu(cfi.icb.extLocation);
- if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+ if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
goto end_unlink;
if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &page_symlink_inode_operations;
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t bsize;
block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
iinfo->i_location.partitionReferenceNum;
bsize = inode->i_sb->s_blocksize;
iinfo->i_lenExtents = bsize;
- udf_add_aext(inode, &epos, eloc, bsize, 0);
+ udf_add_aext(inode, &epos, &eloc, bsize, 0);
brelse(epos.bh);
block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct fileIdentDesc ocfi, ncfi;
struct buffer_head *dir_bh = NULL;
int retval = -ENOENT;
- kernel_lb_addr tloc;
+ struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
brelse(ofibh.sbh);
}
tloc = lelb_to_cpu(ocfi.icb.extLocation);
- if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0)
+ if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
!= old_inode->i_ino)
goto end_rename;
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!dir_fi)
goto end_rename;
tloc = lelb_to_cpu(dir_fi->icb.extLocation);
- if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) !=
+ if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
old_dir->i_ino)
goto end_rename;
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
ncfi.fileVersionNum = ocfi.fileVersionNum;
ncfi.fileCharacteristics = ocfi.fileCharacteristics;
- memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad));
+ memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
/* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
static struct dentry *udf_get_parent(struct dentry *child)
{
+ struct kernel_lb_addr tloc;
struct inode *inode = NULL;
struct qstr dotdot = {.name = "..", .len = 2};
struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
brelse(fibh.ebh);
brelse(fibh.sbh);
- inode = udf_iget(child->d_inode->i_sb,
- lelb_to_cpu(cfi.icb.extLocation));
+ tloc = lelb_to_cpu(cfi.icb.extLocation);
+ inode = udf_iget(child->d_inode->i_sb, &tloc);
if (!inode)
goto out_unlock;
unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
u16 partref, __u32 generation)
{
struct inode *inode;
- kernel_lb_addr loc;
+ struct kernel_lb_addr loc;
if (block == 0)
return ERR_PTR(-ESTALE);
loc.logicalBlockNum = block;
loc.partitionReferenceNum = partref;
- inode = udf_iget(sb, loc);
+ inode = udf_iget(sb, &loc);
if (inode == NULL)
return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
{
int len = *lenp;
struct inode *inode = de->d_inode;
- kernel_lb_addr location = UDF_I(inode)->i_location;
+ struct kernel_lb_addr location = UDF_I(inode)->i_location;
struct fid *fid = (struct fid *)fh;
int type = FILEID_UDF_WITHOUT_PARENT;
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd..fbff74654df 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -85,7 +85,7 @@ struct appIdentSuffix {
/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
/* Implementation Use (UDF 2.50 2.2.6.4) */
struct logicalVolIntegrityDescImpUse {
- regid impIdent;
+ struct regid impIdent;
__le32 numFiles;
__le32 numDirs;
__le16 minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
/* Implementation Use (UDF 2.50 2.2.7.2) */
struct impUseVolDescImpUse {
- charspec LVICharset;
+ struct charspec LVICharset;
dstring logicalVolIdent[128];
dstring LVInfo1[36];
dstring LVInfo2[36];
dstring LVInfo3[36];
- regid impIdent;
+ struct regid impIdent;
uint8_t impUse[128];
} __attribute__ ((packed));
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
uint8_t partitionMapType;
uint8_t partitionMapLength;
uint8_t reserved1[2];
- regid partIdent;
+ struct regid partIdent;
__le16 volSeqNum;
__le16 partitionNum;
} __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
uint8_t reserved1[2];
- regid partIdent;
+ struct regid partIdent;
__le16 volSeqNum;
__le16 partitionNum;
uint8_t reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
uint8_t reserved1[2];
- regid partIdent;
+ struct regid partIdent;
__le16 volSeqNum;
__le16 partitionNum;
__le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
uint8_t reserved1[2];
- regid partIdent;
+ struct regid partIdent;
__le16 volSeqNum;
__le16 partitionNum;
__le32 metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
/* Virtual Allocation Table (UDF 1.5 2.2.10) */
struct virtualAllocationTable15 {
__le32 VirtualSector[0];
- regid vatIdent;
+ struct regid vatIdent;
__le32 previousVATICBLoc;
} __attribute__ ((packed));
@@ -192,8 +192,8 @@ struct sparingEntry {
} __attribute__ ((packed));
struct sparingTable {
- tag descTag;
- regid sparingIdent;
+ struct tag descTag;
+ struct regid sparingIdent;
__le16 reallocationTableLen;
__le16 reserved;
__le32 sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
#define ICBTAG_FILE_TYPE_MIRROR 0xFB
#define ICBTAG_FILE_TYPE_BITMAP 0xFC
-/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
+/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
struct allocDescImpUse {
__le16 flags;
uint8_t impUse[4];
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d..4b540ee632d 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
{
struct super_block *sb = inode->i_sb;
struct udf_part_map *map;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
sector_t ext_offset;
struct extent_position epos = {};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627..72348cc855a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -81,16 +81,13 @@ static char error_buf[1024];
/* These are the "meat" - everything else is stuffing */
static int udf_fill_super(struct super_block *, void *, int);
static void udf_put_super(struct super_block *);
-static void udf_write_super(struct super_block *);
+static int udf_sync_fs(struct super_block *, int);
static int udf_remount_fs(struct super_block *, int *, char *);
-static int udf_check_valid(struct super_block *, int, int);
-static int udf_vrs(struct super_block *sb, int silent);
-static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
-static void udf_find_anchor(struct super_block *);
-static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
- kernel_lb_addr *);
+static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
+static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
+ struct kernel_lb_addr *);
static void udf_load_fileset(struct super_block *, struct buffer_head *,
- kernel_lb_addr *);
+ struct kernel_lb_addr *);
static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
.delete_inode = udf_delete_inode,
.clear_inode = udf_clear_inode,
.put_super = udf_put_super,
- .write_super = udf_write_super,
+ .sync_fs = udf_sync_fs,
.statfs = udf_statfs,
.remount_fs = udf_remount_fs,
.show_options = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
mode_t umask;
gid_t gid;
uid_t uid;
+ mode_t fmode;
+ mode_t dmode;
struct nls_table *nls_map;
};
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
seq_puts(seq, ",nostrict");
- if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE)
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
seq_printf(seq, ",bs=%lu", sb->s_blocksize);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
seq_printf(seq, ",gid=%u", sbi->s_gid);
if (sbi->s_umask != 0)
seq_printf(seq, ",umask=%o", sbi->s_umask);
+ if (sbi->s_fmode != UDF_INVALID_MODE)
+ seq_printf(seq, ",mode=%o", sbi->s_fmode);
+ if (sbi->s_dmode != UDF_INVALID_MODE)
+ seq_printf(seq, ",dmode=%o", sbi->s_dmode);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
seq_printf(seq, ",session=%u", sbi->s_session);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
- /*
- * s_anchor[2] could be zeroed out in case there is no anchor
- * in the specified block, but then the "anchor=N" option
- * originally given by the user wasn't effective, so it's OK
- * if we don't show it.
- */
- if (sbi->s_anchor[2] != 0)
- seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
+ if (sbi->s_anchor != 0)
+ seq_printf(seq, ",anchor=%u", sbi->s_anchor);
/*
* volume, partition, fileset and rootdir seem to be ignored
* currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
*
* gid= Set the default group.
* umask= Set the default umask.
+ * mode= Set the default file permissions.
+ * dmode= Set the default directory permissions.
* uid= Set the default user.
* bs= Set the block size.
* unhide Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
Opt_rootdir, Opt_utf8, Opt_iocharset,
- Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
+ Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
+ Opt_fmode, Opt_dmode
};
static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
{Opt_rootdir, "rootdir=%u"},
{Opt_utf8, "utf8"},
{Opt_iocharset, "iocharset=%s"},
+ {Opt_fmode, "mode=%o"},
+ {Opt_dmode, "dmode=%o"},
{Opt_err, NULL}
};
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
int option;
uopt->novrs = 0;
- uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
uopt->partition = 0xFFFF;
uopt->session = 0xFFFFFFFF;
uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
switch (token) {
case Opt_novrs:
uopt->novrs = 1;
+ break;
case Opt_bs:
if (match_int(&args[0], &option))
return 0;
uopt->blocksize = option;
+ uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
break;
case Opt_unhide:
uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
case Opt_gforget:
uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
break;
+ case Opt_fmode:
+ if (match_octal(args, &option))
+ return 0;
+ uopt->fmode = option & 0777;
+ break;
+ case Opt_dmode:
+ if (match_octal(args, &option))
+ return 0;
+ uopt->dmode = option & 0777;
+ break;
default:
printk(KERN_ERR "udf: bad mount option \"%s\" "
"or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
return 1;
}
-static void udf_write_super(struct super_block *sb)
-{
- lock_kernel();
-
- if (!(sb->s_flags & MS_RDONLY))
- udf_open_lvid(sb);
- sb->s_dirt = 0;
-
- unlock_kernel();
-}
-
static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
{
struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
uopt.uid = sbi->s_uid;
uopt.gid = sbi->s_gid;
uopt.umask = sbi->s_umask;
+ uopt.fmode = sbi->s_fmode;
+ uopt.dmode = sbi->s_dmode;
if (!udf_parse_options(options, &uopt, true))
return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
sbi->s_uid = uopt.uid;
sbi->s_gid = uopt.gid;
sbi->s_umask = uopt.umask;
+ sbi->s_fmode = uopt.fmode;
+ sbi->s_dmode = uopt.dmode;
if (sbi->s_lvid_bh) {
int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
return 0;
}
-static int udf_vrs(struct super_block *sb, int silent)
+/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
+/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
+static loff_t udf_check_vsd(struct super_block *sb)
{
struct volStructDesc *vsd = NULL;
loff_t sector = 32768;
int sectorsize;
struct buffer_head *bh = NULL;
- int iso9660 = 0;
int nsr02 = 0;
int nsr03 = 0;
struct udf_sb_info *sbi;
- /* Block size must be a multiple of 512 */
- if (sb->s_blocksize & 511)
- return 0;
sbi = UDF_SB(sb);
-
if (sb->s_blocksize < sizeof(struct volStructDesc))
sectorsize = sizeof(struct volStructDesc);
else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
break;
} else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
VSD_STD_ID_LEN)) {
- iso9660 = sector;
switch (vsd->structType) {
case 0:
udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
return 0;
}
-/*
- * Check whether there is an anchor block in the given block
- */
-static int udf_check_anchor_block(struct super_block *sb, sector_t block)
-{
- struct buffer_head *bh;
- uint16_t ident;
-
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
- udf_fixed_to_variable(block) >=
- sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
-
- bh = udf_read_tagged(sb, block, block, &ident);
- if (!bh)
- return 0;
- brelse(bh);
-
- return ident == TAG_IDENT_AVDP;
-}
-
-/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
-{
- sector_t last[6];
- int i;
- struct udf_sb_info *sbi = UDF_SB(sb);
-
- last[0] = lastblock;
- last[1] = last[0] - 1;
- last[2] = last[0] + 1;
- last[3] = last[0] - 2;
- last[4] = last[0] - 150;
- last[5] = last[0] - 152;
-
- /* according to spec, anchor is in either:
- * block 256
- * lastblock-256
- * lastblock
- * however, if the disc isn't closed, it could be 512 */
-
- for (i = 0; i < ARRAY_SIZE(last); i++) {
- if (last[i] < 0)
- continue;
- if (last[i] >= sb->s_bdev->bd_inode->i_size >>
- sb->s_blocksize_bits)
- continue;
-
- if (udf_check_anchor_block(sb, last[i])) {
- sbi->s_anchor[0] = last[i];
- sbi->s_anchor[1] = last[i] - 256;
- return last[i];
- }
-
- if (last[i] < 256)
- continue;
-
- if (udf_check_anchor_block(sb, last[i] - 256)) {
- sbi->s_anchor[1] = last[i] - 256;
- return last[i];
- }
- }
-
- if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
- sbi->s_anchor[0] = sbi->s_session + 256;
- return last[0];
- }
- if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
- sbi->s_anchor[0] = sbi->s_session + 512;
- return last[0];
- }
- return 0;
-}
-
-/*
- * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
- * be the last block on the media.
- *
- * Return 1 if not found, 0 if ok
- *
- */
-static void udf_find_anchor(struct super_block *sb)
-{
- sector_t lastblock;
- struct buffer_head *bh = NULL;
- uint16_t ident;
- int i;
- struct udf_sb_info *sbi = UDF_SB(sb);
-
- lastblock = udf_scan_anchors(sb, sbi->s_last_block);
- if (lastblock)
- goto check_anchor;
-
- /* No anchor found? Try VARCONV conversion of block numbers */
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
- /* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb,
- udf_variable_to_fixed(sbi->s_last_block));
- if (lastblock)
- goto check_anchor;
-
- /* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, sbi->s_last_block);
- if (!lastblock) {
- /* VARCONV didn't help. Clear it. */
- UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
- }
-
-check_anchor:
- /*
- * Check located anchors and the anchor block supplied via
- * mount options
- */
- for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
- if (!sbi->s_anchor[i])
- continue;
- bh = udf_read_tagged(sb, sbi->s_anchor[i],
- sbi->s_anchor[i], &ident);
- if (!bh)
- sbi->s_anchor[i] = 0;
- else {
- brelse(bh);
- if (ident != TAG_IDENT_AVDP)
- sbi->s_anchor[i] = 0;
- }
- }
-
- sbi->s_last_block = lastblock;
-}
-
static int udf_find_fileset(struct super_block *sb,
- kernel_lb_addr *fileset,
- kernel_lb_addr *root)
+ struct kernel_lb_addr *fileset,
+ struct kernel_lb_addr *root)
{
struct buffer_head *bh = NULL;
long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
if (fileset->logicalBlockNum != 0xFFFFFFFF ||
fileset->partitionReferenceNum != 0xFFFF) {
- bh = udf_read_ptagged(sb, *fileset, 0, &ident);
+ bh = udf_read_ptagged(sb, fileset, 0, &ident);
if (!bh) {
return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
sbi = UDF_SB(sb);
if (!bh) {
/* Search backwards through the partitions */
- kernel_lb_addr newfileset;
+ struct kernel_lb_addr newfileset;
/* --> cvg: FIXME - is it reasonable? */
return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
newfileset.logicalBlockNum = 0;
do {
- bh = udf_read_ptagged(sb, newfileset, 0,
+ bh = udf_read_ptagged(sb, &newfileset, 0,
&ident);
if (!bh) {
newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
- struct ustr instr;
- struct ustr outstr;
+ struct ustr *instr, *outstr;
struct buffer_head *bh;
uint16_t ident;
+ int ret = 1;
+
+ instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ if (!instr)
+ return 1;
+
+ outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ if (!outstr)
+ goto out1;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
- return 1;
+ goto out2;
+
BUG_ON(ident != TAG_IDENT_PVD);
pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
pvoldesc->recordingDateAndTime)) {
#ifdef UDFFS_DEBUG
- timestamp *ts = &pvoldesc->recordingDateAndTime;
+ struct timestamp *ts = &pvoldesc->recordingDateAndTime;
udf_debug("recording time %04u/%02u/%02u"
" %02u:%02u (%x)\n",
le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
- if (udf_CS0toUTF8(&outstr, &instr)) {
- strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name,
- outstr.u_len > 31 ? 31 : outstr.u_len);
+ if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
+ if (udf_CS0toUTF8(outstr, instr)) {
+ strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+ outstr->u_len > 31 ? 31 : outstr->u_len);
udf_debug("volIdent[] = '%s'\n",
UDF_SB(sb)->s_volume_ident);
}
- if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
- if (udf_CS0toUTF8(&outstr, &instr))
- udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
+ if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
+ if (udf_CS0toUTF8(outstr, instr))
+ udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
brelse(bh);
- return 0;
+ ret = 0;
+out2:
+ kfree(outstr);
+out1:
+ kfree(instr);
+ return ret;
}
static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
struct udf_meta_data *mdata;
- kernel_lb_addr addr;
+ struct kernel_lb_addr addr;
int fe_error = 0;
map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Metadata file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
- mdata->s_metadata_fe = udf_iget(sb, addr);
+ mdata->s_metadata_fe = udf_iget(sb, &addr);
if (mdata->s_metadata_fe == NULL) {
udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Mirror metadata file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
- mdata->s_mirror_fe = udf_iget(sb, addr);
+ mdata->s_mirror_fe = udf_iget(sb, &addr);
if (mdata->s_mirror_fe == NULL) {
if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
udf_debug("Bitmap file location: block = %d part = %d\n",
addr.logicalBlockNum, addr.partitionReferenceNum);
- mdata->s_bitmap_fe = udf_iget(sb, addr);
+ mdata->s_bitmap_fe = udf_iget(sb, &addr);
if (mdata->s_bitmap_fe == NULL) {
if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
}
static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
- kernel_lb_addr *root)
+ struct kernel_lb_addr *root)
{
struct fileSetDesc *fset;
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
if (phd->unallocSpaceTable.extLength) {
- kernel_lb_addr loc = {
+ struct kernel_lb_addr loc = {
.logicalBlockNum = le32_to_cpu(
phd->unallocSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
- map->s_uspace.s_table = udf_iget(sb, loc);
+ map->s_uspace.s_table = udf_iget(sb, &loc);
if (!map->s_uspace.s_table) {
udf_debug("cannot load unallocSpaceTable (part %d)\n",
p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
udf_debug("partitionIntegrityTable (part %d)\n", p_index);
if (phd->freedSpaceTable.extLength) {
- kernel_lb_addr loc = {
+ struct kernel_lb_addr loc = {
.logicalBlockNum = le32_to_cpu(
phd->freedSpaceTable.extPosition),
.partitionReferenceNum = p_index,
};
- map->s_fspace.s_table = udf_iget(sb, loc);
+ map->s_fspace.s_table = udf_iget(sb, &loc);
if (!map->s_fspace.s_table) {
udf_debug("cannot load freedSpaceTable (part %d)\n",
p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map = &sbi->s_partmaps[p_index];
- kernel_lb_addr ino;
+ struct kernel_lb_addr ino;
struct buffer_head *bh = NULL;
struct udf_inode_info *vati;
uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
/* VAT file entry is in the last recorded block */
ino.partitionReferenceNum = type1_index;
ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
- sbi->s_vat_inode = udf_iget(sb, ino);
+ sbi->s_vat_inode = udf_iget(sb, &ino);
if (!sbi->s_vat_inode)
return 1;
@@ -1322,7 +1208,7 @@ out_bh:
}
static int udf_load_logicalvol(struct super_block *sb, sector_t block,
- kernel_lb_addr *fileset)
+ struct kernel_lb_addr *fileset)
{
struct logicalVolDesc *lvd;
int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
}
if (fileset) {
- long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]);
+ struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
*fileset = lelb_to_cpu(la->extLocation);
udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
* udf_load_logicalvolint
*
*/
-static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
+static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
{
struct buffer_head *bh = NULL;
uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
* Written, tested, and released.
*/
static noinline int udf_process_sequence(struct super_block *sb, long block,
- long lastblock, kernel_lb_addr *fileset)
+ long lastblock, struct kernel_lb_addr *fileset)
{
struct buffer_head *bh = NULL;
struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
return 0;
}
+static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
+ struct kernel_lb_addr *fileset)
+{
+ struct anchorVolDescPtr *anchor;
+ long main_s, main_e, reserve_s, reserve_e;
+ struct udf_sb_info *sbi;
+
+ sbi = UDF_SB(sb);
+ anchor = (struct anchorVolDescPtr *)bh->b_data;
+
+ /* Locate the main sequence */
+ main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
+ main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
+ main_e = main_e >> sb->s_blocksize_bits;
+ main_e += main_s;
+
+ /* Locate the reserve sequence */
+ reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
+ reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
+ reserve_e = reserve_e >> sb->s_blocksize_bits;
+ reserve_e += reserve_s;
+
+ /* Process the main & reserve sequences */
+ /* responsible for finding the PartitionDesc(s) */
+ if (!udf_process_sequence(sb, main_s, main_e, fileset))
+ return 1;
+ return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+}
+
/*
- * udf_check_valid()
+ * Check whether there is an anchor block in the given block and
+ * load Volume Descriptor Sequence if so.
*/
-static int udf_check_valid(struct super_block *sb, int novrs, int silent)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+ struct kernel_lb_addr *fileset)
{
- long block;
- struct udf_sb_info *sbi = UDF_SB(sb);
+ struct buffer_head *bh;
+ uint16_t ident;
+ int ret;
- if (novrs) {
- udf_debug("Validity check skipped because of novrs option\n");
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+ udf_fixed_to_variable(block) >=
+ sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+ return 0;
+
+ bh = udf_read_tagged(sb, block, block, &ident);
+ if (!bh)
+ return 0;
+ if (ident != TAG_IDENT_AVDP) {
+ brelse(bh);
return 0;
}
- /* Check that it is NSR02 compliant */
- /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
- block = udf_vrs(sb, silent);
- if (block == -1)
- udf_debug("Failed to read byte 32768. Assuming open "
- "disc. Skipping validity check\n");
- if (block && !sbi->s_last_block)
- sbi->s_last_block = udf_get_last_block(sb);
- return !block;
+ ret = udf_load_sequence(sb, bh, fileset);
+ brelse(bh);
+ return ret;
}
-static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
+/* Search for an anchor volume descriptor pointer */
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
+ struct kernel_lb_addr *fileset)
{
- struct anchorVolDescPtr *anchor;
- uint16_t ident;
- struct buffer_head *bh;
- long main_s, main_e, reserve_s, reserve_e;
+ sector_t last[6];
int i;
- struct udf_sb_info *sbi;
-
- if (!sb)
- return 1;
- sbi = UDF_SB(sb);
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ int last_count = 0;
- for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
- if (!sbi->s_anchor[i])
+ /* First try user provided anchor */
+ if (sbi->s_anchor) {
+ if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
+ return lastblock;
+ }
+ /*
+ * according to spec, anchor is in either:
+ * block 256
+ * lastblock-256
+ * lastblock
+ * however, if the disc isn't closed, it could be 512.
+ */
+ if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
+ return lastblock;
+ /*
+ * The trouble is which block is the last one. Drives often misreport
+ * this so we try various possibilities.
+ */
+ last[last_count++] = lastblock;
+ if (lastblock >= 1)
+ last[last_count++] = lastblock - 1;
+ last[last_count++] = lastblock + 1;
+ if (lastblock >= 2)
+ last[last_count++] = lastblock - 2;
+ if (lastblock >= 150)
+ last[last_count++] = lastblock - 150;
+ if (lastblock >= 152)
+ last[last_count++] = lastblock - 152;
+
+ for (i = 0; i < last_count; i++) {
+ if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+ sb->s_blocksize_bits)
continue;
-
- bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
- &ident);
- if (!bh)
+ if (udf_check_anchor_block(sb, last[i], fileset))
+ return last[i];
+ if (last[i] < 256)
continue;
+ if (udf_check_anchor_block(sb, last[i] - 256, fileset))
+ return last[i];
+ }
- anchor = (struct anchorVolDescPtr *)bh->b_data;
+ /* Finally try block 512 in case media is open */
+ if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
+ return last[0];
+ return 0;
+}
- /* Locate the main sequence */
- main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
- main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
- main_e = main_e >> sb->s_blocksize_bits;
- main_e += main_s;
+/*
+ * Find an anchor volume descriptor and load Volume Descriptor Sequence from
+ * area specified by it. The function expects sbi->s_lastblock to be the last
+ * block on the media.
+ *
+ * Return 1 if ok, 0 if not found.
+ *
+ */
+static int udf_find_anchor(struct super_block *sb,
+ struct kernel_lb_addr *fileset)
+{
+ sector_t lastblock;
+ struct udf_sb_info *sbi = UDF_SB(sb);
- /* Locate the reserve sequence */
- reserve_s = le32_to_cpu(
- anchor->reserveVolDescSeqExt.extLocation);
- reserve_e = le32_to_cpu(
- anchor->reserveVolDescSeqExt.extLength);
- reserve_e = reserve_e >> sb->s_blocksize_bits;
- reserve_e += reserve_s;
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+ if (lastblock)
+ goto out;
- brelse(bh);
+ /* No anchor found? Try VARCONV conversion of block numbers */
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ /* Firstly, we try to not convert number of the last block */
+ lastblock = udf_scan_anchors(sb,
+ udf_variable_to_fixed(sbi->s_last_block),
+ fileset);
+ if (lastblock)
+ goto out;
- /* Process the main & reserve sequences */
- /* responsible for finding the PartitionDesc(s) */
- if (!(udf_process_sequence(sb, main_s, main_e,
- fileset) &&
- udf_process_sequence(sb, reserve_s, reserve_e,
- fileset)))
- break;
+ /* Secondly, we try with converted number of the last block */
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+ if (!lastblock) {
+ /* VARCONV didn't help. Clear it. */
+ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+ return 0;
}
+out:
+ sbi->s_last_block = lastblock;
+ return 1;
+}
- if (i == ARRAY_SIZE(sbi->s_anchor)) {
- udf_debug("No Anchor block found\n");
- return 1;
+/*
+ * Check Volume Structure Descriptor, find Anchor block and load Volume
+ * Descriptor Sequence
+ */
+static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
+ int silent, struct kernel_lb_addr *fileset)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+ loff_t nsr_off;
+
+ if (!sb_set_blocksize(sb, uopt->blocksize)) {
+ if (!silent)
+ printk(KERN_WARNING "UDF-fs: Bad block size\n");
+ return 0;
+ }
+ sbi->s_last_block = uopt->lastblock;
+ if (!uopt->novrs) {
+ /* Check that it is NSR02 compliant */
+ nsr_off = udf_check_vsd(sb);
+ if (!nsr_off) {
+ if (!silent)
+ printk(KERN_WARNING "UDF-fs: No VRS found\n");
+ return 0;
+ }
+ if (nsr_off == -1)
+ udf_debug("Failed to read byte 32768. Assuming open "
+ "disc. Skipping validity check\n");
+ if (!sbi->s_last_block)
+ sbi->s_last_block = udf_get_last_block(sb);
+ } else {
+ udf_debug("Validity check skipped because of novrs option\n");
}
- udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
- return 0;
+ /* Look for anchor block and load Volume Descriptor Sequence */
+ sbi->s_anchor = uopt->anchor;
+ if (!udf_find_anchor(sb, fileset)) {
+ if (!silent)
+ printk(KERN_WARNING "UDF-fs: No anchor found\n");
+ return 0;
+ }
+ return 1;
}
static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
+
if (!bh)
return;
-
lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
lvidiu = udf_sb_lvidiu(sbi);
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
CURRENT_TIME);
- lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
+ lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(tag),
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
le16_to_cpu(lvid->descTag.descCRCLength)));
lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
mark_buffer_dirty(bh);
+ sbi->s_lvid_dirty = 0;
}
static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
return;
lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
-
- if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
- return;
-
lvidiu = udf_sb_lvidiu(sbi);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(tag),
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
le16_to_cpu(lvid->descTag.descCRCLength)));
lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
mark_buffer_dirty(bh);
+ sbi->s_lvid_dirty = 0;
}
static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
static int udf_fill_super(struct super_block *sb, void *options, int silent)
{
int i;
+ int ret;
struct inode *inode = NULL;
struct udf_options uopt;
- kernel_lb_addr rootdir, fileset;
+ struct kernel_lb_addr rootdir, fileset;
struct udf_sb_info *sbi;
uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
uopt.uid = -1;
uopt.gid = -1;
uopt.umask = 0;
+ uopt.fmode = UDF_INVALID_MODE;
+ uopt.dmode = UDF_INVALID_MODE;
sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sbi->s_uid = uopt.uid;
sbi->s_gid = uopt.gid;
sbi->s_umask = uopt.umask;
+ sbi->s_fmode = uopt.fmode;
+ sbi->s_dmode = uopt.dmode;
sbi->s_nls_map = uopt.nls_map;
- /* Set the block size for all transfers */
- if (!sb_min_blocksize(sb, uopt.blocksize)) {
- udf_debug("Bad block size (%d)\n", uopt.blocksize);
- printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
- goto error_out;
- }
-
if (uopt.session == 0xFFFFFFFF)
sbi->s_session = udf_get_last_session(sb);
else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
udf_debug("Multi-session=%d\n", sbi->s_session);
- sbi->s_last_block = uopt.lastblock;
- sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
- sbi->s_anchor[2] = uopt.anchor;
-
- if (udf_check_valid(sb, uopt.novrs, silent)) {
- /* read volume recognition sequences */
- printk(KERN_WARNING "UDF-fs: No VRS found\n");
- goto error_out;
- }
-
- udf_find_anchor(sb);
-
/* Fill in the rest of the superblock */
sb->s_op = &udf_sb_ops;
sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
sb->s_magic = UDF_SUPER_MAGIC;
sb->s_time_gran = 1000;
- if (udf_load_sequence(sb, &fileset)) {
+ if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+ ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ } else {
+ uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+ ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+ if (!silent)
+ printk(KERN_NOTICE
+ "UDF-fs: Rescanning with blocksize "
+ "%d\n", UDF_DEFAULT_BLOCKSIZE);
+ uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+ ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+ }
+ }
+ if (!ret) {
printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
goto error_out;
}
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (!silent) {
- timestamp ts;
+ struct timestamp ts;
udf_time_to_disk_stamp(&ts, sbi->s_record_time);
udf_info("UDF: Mounting volume '%s', "
"timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
/* Assign the root inode */
/* assign inodes by physical block number */
/* perhaps it's not extensible enough, but for now ... */
- inode = udf_iget(sb, rootdir);
+ inode = udf_iget(sb, &rootdir);
if (!inode) {
printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
"partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+ struct udf_sb_info *sbi = UDF_SB(sb);
+
+ mutex_lock(&sbi->s_alloc_mutex);
+ if (sbi->s_lvid_dirty) {
+ /*
+ * Blockdevice will be synced later so we don't have to submit
+ * the buffer for IO
+ */
+ mark_buffer_dirty(sbi->s_lvid_bh);
+ sb->s_dirt = 0;
+ sbi->s_lvid_dirty = 0;
+ }
+ mutex_unlock(&sbi->s_alloc_mutex);
+
+ return 0;
+}
+
static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct udf_sb_info *sbi = UDF_SB(sb);
struct logicalVolIntegrityDescImpUse *lvidiu;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
if (sbi->s_lvid_bh != NULL)
lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
le32_to_cpu(lvidiu->numDirs)) : 0)
+ buf->f_bfree;
buf->f_ffree = buf->f_bfree;
- /* __kernel_fsid_t f_fsid */
buf->f_namelen = UDF_NAME_LEN - 2;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
return 0;
}
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
unsigned int accum = 0;
int index;
int block = 0, newblock;
- kernel_lb_addr loc;
+ struct kernel_lb_addr loc;
uint32_t bytes;
uint8_t *ptr;
uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
loc.logicalBlockNum = bitmap->s_extPosition;
loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
- bh = udf_read_ptagged(sb, loc, 0, &ident);
+ bh = udf_read_ptagged(sb, &loc, 0, &ident);
if (!bh) {
printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
bytes -= cur_bytes;
if (bytes) {
brelse(bh);
- newblock = udf_get_lb_pblock(sb, loc, ++block);
+ newblock = udf_get_lb_pblock(sb, &loc, ++block);
bh = udf_tread(sb, newblock);
if (!bh) {
udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
{
unsigned int accum = 0;
uint32_t elen;
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
int8_t etype;
struct extent_position epos;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 65e19b4f942..225527cdc88 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,10 +28,10 @@
#include "udf_sb.h"
static void extent_trunc(struct inode *inode, struct extent_position *epos,
- kernel_lb_addr eloc, int8_t etype, uint32_t elen,
+ struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
uint32_t nelen)
{
- kernel_lb_addr neloc = {};
+ struct kernel_lb_addr neloc = {};
int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
last_block);
etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
} else
- neloc = eloc;
+ neloc = *eloc;
nelen = (etype << 30) | nelen;
}
if (elen != nelen) {
- udf_write_aext(inode, epos, neloc, nelen, 0);
+ udf_write_aext(inode, epos, &neloc, nelen, 0);
if (last_block - first_block > 0) {
if (etype == (EXT_RECORDED_ALLOCATED >> 30))
mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
void udf_truncate_tail_extent(struct inode *inode)
{
struct extent_position epos = {};
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen, nelen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
return;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
BUG();
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
(unsigned)elen);
nelen = elen - (lbcount - inode->i_size);
epos.offset -= adsize;
- extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+ extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
epos.offset += adsize;
if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
void udf_discard_prealloc(struct inode *inode)
{
struct extent_position epos = { NULL, 0, {0, 0} };
- kernel_lb_addr eloc;
+ struct kernel_lb_addr eloc;
uint32_t elen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
return;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
adsize = 0;
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
epos.offset -= adsize;
lbcount -= elen;
- extent_trunc(inode, &epos, eloc, etype, elen, 0);
+ extent_trunc(inode, &epos, &eloc, etype, elen, 0);
if (!epos.bh) {
iinfo->i_lenAlloc =
epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
void udf_truncate_extents(struct inode *inode)
{
struct extent_position epos;
- kernel_lb_addr eloc, neloc = {};
+ struct kernel_lb_addr eloc, neloc = {};
uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
int8_t etype;
struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
struct udf_inode_info *iinfo = UDF_I(inode);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(short_ad);
+ adsize = sizeof(struct short_ad);
else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(long_ad);
+ adsize = sizeof(struct long_ad);
else
BUG();
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
(inode->i_size & (sb->s_blocksize - 1));
if (etype != -1) {
epos.offset -= adsize;
- extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+ extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
epos.offset += adsize;
if (byte_offset)
lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
while ((etype = udf_current_aext(inode, &epos, &eloc,
&elen, 0)) != -1) {
if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
- udf_write_aext(inode, &epos, neloc, nelen, 0);
+ udf_write_aext(inode, &epos, &neloc, nelen, 0);
if (indirect_ext_len) {
/* We managed to free all extents in the
* indirect extent - free it too */
BUG_ON(!epos.bh);
- udf_free_blocks(sb, inode, epos.block,
+ udf_free_blocks(sb, inode, &epos.block,
0, indirect_ext_len);
} else if (!epos.bh) {
iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
epos.offset = sizeof(struct allocExtDesc);
epos.block = eloc;
epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, eloc, 0));
+ udf_get_lb_pblock(sb, &eloc, 0));
if (elen)
indirect_ext_len =
(elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
else
indirect_ext_len = 1;
} else {
- extent_trunc(inode, &epos, eloc, etype,
+ extent_trunc(inode, &epos, &eloc, etype,
elen, 0);
epos.offset += adsize;
}
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
if (indirect_ext_len) {
BUG_ON(!epos.bh);
- udf_free_blocks(sb, inode, epos.block, 0,
+ udf_free_blocks(sb, inode, &epos.block, 0,
indirect_ext_len);
} else if (!epos.bh) {
iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
udf_update_alloc_ext_desc(inode, &epos, lenalloc);
} else if (inode->i_size) {
if (byte_offset) {
- kernel_long_ad extent;
+ struct kernel_long_ad extent;
/*
* OK, there is not extent covering inode->i_size and
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4f86b1d98a5..e58d1de4107 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -4,7 +4,7 @@
struct udf_inode_info {
struct timespec i_crtime;
/* Physical address of inode */
- kernel_lb_addr i_location;
+ struct kernel_lb_addr i_location;
__u64 i_unique;
__u32 i_lenEAttr;
__u32 i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
unsigned i_strat4096 : 1;
unsigned reserved : 26;
union {
- short_ad *i_sad;
- long_ad *i_lad;
+ struct short_ad *i_sad;
+ struct long_ad *i_lad;
__u8 *i_data;
} i_ext;
struct inode vfs_inode;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 1c1c514a972..d113b72c276 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,6 +30,7 @@
#define UDF_FLAG_GID_SET 16
#define UDF_FLAG_SESSION_SET 17
#define UDF_FLAG_LASTBLOCK_SET 18
+#define UDF_FLAG_BLOCKSIZE_SET 19
#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
@@ -48,6 +49,8 @@
#define UDF_SPARABLE_MAP15 0x1522U
#define UDF_METADATA_MAP25 0x2511U
+#define UDF_INVALID_MODE ((mode_t)-1)
+
#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
/* Sector headers */
__s32 s_session;
- __u32 s_anchor[3];
+ __u32 s_anchor;
__u32 s_last_block;
struct buffer_head *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
mode_t s_umask;
gid_t s_gid;
uid_t s_uid;
+ mode_t s_fmode;
+ mode_t s_dmode;
/* Root Info */
struct timespec s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
struct inode *s_vat_inode;
struct mutex s_alloc_mutex;
+ /* Protected by s_alloc_mutex */
+ unsigned int s_lvid_dirty;
};
static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8ec865de5f1..cac51b77a5d 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
return 0;
}
-#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
-
/* computes tag checksum */
-u8 udf_tag_checksum(const tag *t);
+u8 udf_tag_checksum(const struct tag *t);
struct dentry;
struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
};
struct generic_desc {
- tag descTag;
+ struct tag descTag;
__le32 volDescSeqNum;
};
@@ -108,11 +106,22 @@ struct ustr {
struct extent_position {
struct buffer_head *bh;
uint32_t offset;
- kernel_lb_addr block;
+ struct kernel_lb_addr block;
};
/* super.c */
extern void udf_warning(struct super_block *, const char *, const char *, ...);
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+ struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+ BUG_ON(!bh);
+ WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+ bh->b_data)->integrityType !=
+ cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+ sb->s_dirt = 1;
+ UDF_SB(sb)->s_lvid_dirty = 1;
+}
/* namei.c */
extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
unsigned long);
/* inode.c */
-extern struct inode *udf_iget(struct super_block *, kernel_lb_addr);
+extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
extern int udf_sync_inode(struct inode *);
extern void udf_expand_file_adinicb(struct inode *, int, int *);
extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
extern int udf_write_inode(struct inode *, int);
extern long udf_block_map(struct inode *, sector_t);
extern int udf_extend_file(struct inode *, struct extent_position *,
- kernel_long_ad *, sector_t);
+ struct kernel_long_ad *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
- kernel_lb_addr *, uint32_t *, sector_t *);
+ struct kernel_lb_addr *, uint32_t *, sector_t *);
extern int8_t udf_add_aext(struct inode *, struct extent_position *,
- kernel_lb_addr, uint32_t, int);
+ struct kernel_lb_addr *, uint32_t, int);
extern int8_t udf_write_aext(struct inode *, struct extent_position *,
- kernel_lb_addr, uint32_t, int);
+ struct kernel_lb_addr *, uint32_t, int);
extern int8_t udf_delete_aext(struct inode *, struct extent_position,
- kernel_lb_addr, uint32_t);
+ struct kernel_lb_addr, uint32_t);
extern int8_t udf_next_aext(struct inode *, struct extent_position *,
- kernel_lb_addr *, uint32_t *, int);
+ struct kernel_lb_addr *, uint32_t *, int);
extern int8_t udf_current_aext(struct inode *, struct extent_position *,
- kernel_lb_addr *, uint32_t *, int);
+ struct kernel_lb_addr *, uint32_t *, int);
/* misc.c */
extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
uint32_t, uint16_t *);
extern struct buffer_head *udf_read_ptagged(struct super_block *,
- kernel_lb_addr, uint32_t,
+ struct kernel_lb_addr *, uint32_t,
uint16_t *);
extern void udf_update_tag(char *, int);
extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
uint32_t);
extern int udf_relocate_blocks(struct super_block *, long, long *);
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+ uint32_t offset)
+{
+ return udf_get_pblock(sb, loc->logicalBlockNum,
+ loc->partitionReferenceNum, offset);
+}
+
/* unicode.c */
extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
/* balloc.c */
extern void udf_free_blocks(struct super_block *, struct inode *,
- kernel_lb_addr, uint32_t, uint32_t);
+ struct kernel_lb_addr *, uint32_t, uint32_t);
extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
uint32_t, uint32_t);
extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
struct udf_fileident_bh *,
struct fileIdentDesc *,
struct extent_position *,
- kernel_lb_addr *, uint32_t *,
+ struct kernel_lb_addr *, uint32_t *,
sector_t *);
extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
int *offset);
-extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
-extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
/* udftime.c */
extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
- timestamp src);
-extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
+ struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
#endif /* __UDF_DECL_H */
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index 489f52fb428..6a9f3a9cc42 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -4,9 +4,9 @@
#include <asm/byteorder.h>
#include <linux/string.h>
-static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
{
- kernel_lb_addr out;
+ struct kernel_lb_addr out;
out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
return out;
}
-static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
{
- lb_addr out;
+ struct lb_addr out;
out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
return out;
}
-static inline short_ad lesa_to_cpu(short_ad in)
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
{
- short_ad out;
+ struct short_ad out;
out.extLength = le32_to_cpu(in.extLength);
out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
return out;
}
-static inline short_ad cpu_to_lesa(short_ad in)
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
{
- short_ad out;
+ struct short_ad out;
out.extLength = cpu_to_le32(in.extLength);
out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
return out;
}
-static inline kernel_long_ad lela_to_cpu(long_ad in)
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
{
- kernel_long_ad out;
+ struct kernel_long_ad out;
out.extLength = le32_to_cpu(in.extLength);
out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
return out;
}
-static inline long_ad cpu_to_lela(kernel_long_ad in)
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
{
- long_ad out;
+ struct long_ad out;
out.extLength = cpu_to_le32(in.extLength);
out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
return out;
}
-static inline kernel_extent_ad leea_to_cpu(extent_ad in)
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
{
- kernel_extent_ad out;
+ struct kernel_extent_ad out;
out.extLength = le32_to_cpu(in.extLength);
out.extLocation = le32_to_cpu(in.extLocation);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 5f811655c9b..b8c828c4d20 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
#define SECS_PER_HOUR (60 * 60)
#define SECS_PER_DAY (SECS_PER_HOUR * 24)
-struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
{
int yday;
u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
return dest;
}
-timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
{
long int days, rem, y;
const unsigned short int *ip;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 9fdf8c93c58..cefa8c8913e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
{
const uint8_t *ocu;
uint8_t cmp_id, ocu_len;
- int i;
+ int i, len;
ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
if (cmp_id == 16)
c = (c << 8) | ocu[i++];
- utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
+ len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+ UDF_NAME_LEN - utf_o->u_len);
+ /* Valid character? */
+ if (len >= 0)
+ utf_o->u_len += len;
+ else
+ utf_o->u_name[utf_o->u_len++] = '?';
}
utf_o->u_cmpID = 8;
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
int length)
{
- unsigned len, i, max_val;
+ int len;
+ unsigned i, max_val;
uint16_t uni_char;
int u_len;
@@ -302,8 +308,13 @@ try_again:
u_len = 0U;
for (i = 0U; i < uni->u_len; i++) {
len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
- if (len <= 0)
+ if (!len)
continue;
+ /* Invalid character, deal with it */
+ if (len < 0) {
+ len = 1;
+ uni_char = '?';
+ }
if (uni_char > max_val) {
max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
int flen)
{
- struct ustr filename, unifilename;
- int len;
+ struct ustr *filename, *unifilename;
+ int len = 0;
- if (udf_build_ustr_exact(&unifilename, sname, flen))
+ filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ if (!filename)
return 0;
+ unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ if (!unifilename)
+ goto out1;
+
+ if (udf_build_ustr_exact(unifilename, sname, flen))
+ goto out2;
+
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- if (!udf_CS0toUTF8(&filename, &unifilename)) {
+ if (!udf_CS0toUTF8(filename, unifilename)) {
udf_debug("Failed in udf_get_filename: sname = %s\n",
sname);
- return 0;
+ goto out2;
}
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
- &unifilename)) {
+ if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+ unifilename)) {
udf_debug("Failed in udf_get_filename: sname = %s\n",
sname);
- return 0;
+ goto out2;
}
} else
- return 0;
-
- len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
- unifilename.u_name, unifilename.u_len);
- if (len)
- return len;
-
- return 0;
+ goto out2;
+
+ len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+ unifilename->u_name, unifilename->u_len);
+out2:
+ kfree(unifilename);
+out1:
+ kfree(filename);
+ return len;
}
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e1c1fc5ee23..60359291761 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1268,6 +1268,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ufs_super_block_first *usb1;
struct ufs_super_block_second *usb2;
struct ufs_super_block_third *usb3;
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
lock_kernel();
@@ -1290,6 +1291,8 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
buf->f_files = uspi->s_ncg * uspi->s_ipg;
buf->f_namelen = UFS_MAXNAMLEN;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
unlock_kernel();
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c3dc491fff8..60f107e47fe 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
xfs_qm_syscalls.o \
xfs_qm_bhv.o \
xfs_qm.o)
+xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
ifeq ($(CONFIG_XFS_QUOTA),y)
xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644
index 2a88d56c4dc..00000000000
--- a/fs/xfs/linux-2.6/mutex.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
-
-#include <linux/mutex.h>
-
-typedef struct mutex mutex_t;
-
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index de3a198f771..c13f67300fe 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
.bmap = xfs_vm_bmap,
.direct_IO = xfs_vm_direct_IO,
.migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0..f4e25544157 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
STATIC int
xfs_vm_page_mkwrite(
struct vm_area_struct *vma,
- struct page *page)
+ struct vm_fault *vmf)
{
- return block_page_mkwrite(vma, page, xfs_get_blocks);
+ return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
const struct file_operations xfs_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4bd112313f3..d0b499418a7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -34,6 +34,7 @@
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_ioctl.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
+ struct file *file = NULL;
+ struct path path;
+ int error;
+ struct xfs_inode *ip;
- memset((char *)&handle, 0, sizeof(handle));
-
- switch (cmd) {
- case XFS_IOC_PATH_TO_FSHANDLE:
- case XFS_IOC_PATH_TO_HANDLE: {
- struct path path;
- int error = user_lpath((const char __user *)hreq->path, &path);
+ if (cmd == XFS_IOC_FD_TO_HANDLE) {
+ file = fget(hreq->fd);
+ if (!file)
+ return -EBADF;
+ inode = file->f_path.dentry->d_inode;
+ } else {
+ error = user_lpath((const char __user *)hreq->path, &path);
if (error)
return error;
-
- ASSERT(path.dentry);
- ASSERT(path.dentry->d_inode);
- inode = igrab(path.dentry->d_inode);
- path_put(&path);
- break;
+ inode = path.dentry->d_inode;
}
+ ip = XFS_I(inode);
- case XFS_IOC_FD_TO_HANDLE: {
- struct file *file;
-
- file = fget(hreq->fd);
- if (!file)
- return -EBADF;
+ /*
+ * We can only generate handles for inodes residing on a XFS filesystem,
+ * and only for regular files, directories or symbolic links.
+ */
+ error = -EINVAL;
+ if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+ goto out_put;
- ASSERT(file->f_path.dentry);
- ASSERT(file->f_path.dentry->d_inode);
- inode = igrab(file->f_path.dentry->d_inode);
- fput(file);
- break;
- }
+ error = -EBADF;
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ goto out_put;
- default:
- ASSERT(0);
- return -XFS_ERROR(EINVAL);
- }
- if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
- /* we're not in XFS anymore, Toto */
- iput(inode);
- return -XFS_ERROR(EINVAL);
- }
+ memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- break;
- default:
- iput(inode);
- return -XFS_ERROR(EBADF);
- }
-
- /* now we can grab the fsid */
- memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
- sizeof(xfs_fsid_t));
- hsize = sizeof(xfs_fsid_t);
-
- if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
- xfs_inode_t *ip = XFS_I(inode);
+ if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+ /*
+ * This handle only contains an fsid, zero the rest.
+ */
+ memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+ hsize = sizeof(xfs_fsid_t);
+ } else {
int lock_mode;
- /* need to get access to the xfs_inode to read the generation */
lock_mode = xfs_ilock_map_shared(ip);
-
- /* fill in fid section of handle from inode */
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
handle.ha_fid.fid_gen = ip->i_d.di_gen;
handle.ha_fid.fid_ino = ip->i_ino;
-
xfs_iunlock_map_shared(ip, lock_mode);
hsize = XFS_HSIZE(handle);
}
- /* now copy our handle into the user buffer & write out the size */
+ error = -EFAULT;
if (copy_to_user(hreq->ohandle, &handle, hsize) ||
- copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
- iput(inode);
- return -XFS_ERROR(EFAULT);
- }
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+ goto out_put;
- iput(inode);
- return 0;
+ error = 0;
+
+ out_put:
+ if (cmd == XFS_IOC_FD_TO_HANDLE)
+ fput(file);
+ else
+ path_put(&path);
+ return error;
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67..6075382336d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -211,8 +211,13 @@ xfs_vn_mknod(
* Irix uses Missed'em'V split, but doesn't want to see
* the upper 5 bits of (14bit) major.
*/
- if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
- return -EINVAL;
+ if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+ return -EINVAL;
+ rdev = sysv_encode_dev(rdev);
+ } else {
+ rdev = 0;
+ }
if (test_default_acl && test_default_acl(dir)) {
if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
}
}
- xfs_dentry_to_name(&name, dentry);
-
if (IS_POSIXACL(dir) && !default_acl)
- mode &= ~current->fs->umask;
-
- switch (mode & S_IFMT) {
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
- rdev = sysv_encode_dev(rdev);
- case S_IFREG:
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
- break;
- case S_IFDIR:
- error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
- break;
- default:
- error = EINVAL;
- break;
- }
+ mode &= ~current_umask();
+ xfs_dentry_to_name(&name, dentry);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
if (unlikely(error))
goto out_free_acl;
@@ -416,7 +404,7 @@ xfs_vn_symlink(
mode_t mode;
mode = S_IFLNK |
- (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+ (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
xfs_dentry_to_name(&name, dentry);
error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
@@ -553,9 +541,6 @@ xfs_vn_getattr(
stat->uid = ip->i_d.di_uid;
stat->gid = ip->i_d.di_gid;
stat->ino = ip->i_ino;
-#if XFS_BIG_INUMS
- stat->ino += mp->m_inoadd;
-#endif
stat->atime = inode->i_atime;
stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 507492d6dcc..f65a53f8752 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -38,7 +38,6 @@
#include <kmem.h>
#include <mrlock.h>
#include <sv.h>
-#include <mutex.h>
#include <time.h>
#include <support/ktrace.h>
@@ -51,6 +50,7 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/file.h>
#include <linux/swap.h>
#include <linux/errno.h>
@@ -147,17 +147,6 @@
#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
-/*
- * IRIX (BSD) quotactl makes use of separate commands for user/group,
- * whereas on Linux the syscall encodes this information into the cmd
- * field (see the QCMD macro in quota.h). These macros help keep the
- * code portable - they are not visible from the syscall interface.
- */
-#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */
-#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */
-#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */
-#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */
-
#define dfltprid 0
#define MAXPATHLEN 1024
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 00000000000..94d9a633d3d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_dmapi.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+ switch (type) {
+ case USRQUOTA:
+ return XFS_DQ_USER;
+ case GRPQUOTA:
+ return XFS_DQ_GROUP;
+ default:
+ return XFS_DQ_PROJ;
+ }
+}
+
+STATIC int
+xfs_fs_quota_sync(
+ struct super_block *sb,
+ int type)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ return -xfs_sync_inodes(mp, SYNC_DELWRI);
+}
+
+STATIC int
+xfs_fs_get_xstate(
+ struct super_block *sb,
+ struct fs_quota_stat *fqs)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+ struct super_block *sb,
+ unsigned int uflags,
+ int op)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (uflags & XFS_QUOTA_UDQ_ACCT)
+ flags |= XFS_UQUOTA_ACCT;
+ if (uflags & XFS_QUOTA_PDQ_ACCT)
+ flags |= XFS_PQUOTA_ACCT;
+ if (uflags & XFS_QUOTA_GDQ_ACCT)
+ flags |= XFS_GQUOTA_ACCT;
+ if (uflags & XFS_QUOTA_UDQ_ENFD)
+ flags |= XFS_UQUOTA_ENFD;
+ if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+ flags |= XFS_OQUOTA_ENFD;
+
+ switch (op) {
+ case Q_XQUOTAON:
+ return -xfs_qm_scall_quotaon(mp, flags);
+ case Q_XQUOTAOFF:
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+ return -xfs_qm_scall_quotaoff(mp, flags);
+ case Q_XQUOTARM:
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+ return -xfs_qm_scall_trunc_qfiles(mp, flags);
+ }
+
+ return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_xquota(
+ struct super_block *sb,
+ int type,
+ qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_xquota(
+ struct super_block *sb,
+ int type,
+ qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+struct quotactl_ops xfs_quotactl_operations = {
+ .quota_sync = xfs_fs_quota_sync,
+ .get_xstate = xfs_fs_get_xstate,
+ .set_xstate = xfs_fs_set_xstate,
+ .get_xquota = xfs_fs_get_xquota,
+ .set_xquota = xfs_fs_set_xquota,
+};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 32ae5028e96..bb685269f83 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -68,7 +68,6 @@
#include <linux/freezer.h>
#include <linux/parser.h>
-static struct quotactl_ops xfs_quotactl_operations;
static struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
-#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */
#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
int dswidth = 0;
int iosize = 0;
int dmapi_implies_ikeep = 1;
- uchar_t iosizelog = 0;
+ __uint8_t iosizelog = 0;
/*
* Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
mp->m_flags |= XFS_MOUNT_NORECOVERY;
- } else if (!strcmp(this_char, MNTOPT_INO64)) {
-#if XFS_BIG_INUMS
- mp->m_flags |= XFS_MOUNT_INO64;
- mp->m_inoadd = XFS_INO64_OFFSET;
-#else
- cmn_err(CE_WARN,
- "XFS: %s option not allowed on this system",
- this_char);
- return EINVAL;
-#endif
} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
mp->m_flags |= XFS_MOUNT_NOALIGN;
} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
/* the few simple ones we can get from the mount struct */
{ XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
{ XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
- { XFS_MOUNT_INO64, "," MNTOPT_INO64 },
{ XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
{ XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
{ XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
return (((__uint64_t)pagefactor) << bitshift) - 1;
}
-int
+STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
return -error;
}
-void
+STATIC void
xfs_blkdev_put(
struct block_device *bdev)
{
@@ -872,7 +859,7 @@ xfsaild_wakeup(
wake_up_process(ailp->xa_task);
}
-int
+STATIC int
xfsaild(
void *data)
{
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
int sync)
{
struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
int error = 0;
- int flags = 0;
xfs_itrace_entry(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
if (sync) {
error = xfs_wait_on_pages(ip, 0, -1);
if (error)
- goto out_error;
- flags |= FLUSH_SYNC;
+ goto out;
}
- error = xfs_inode_flush(ip, flags);
-out_error:
+ /*
+ * Bypass inodes which have already been cleaned by
+ * the inode flush clustering code inside xfs_iflush
+ */
+ if (xfs_inode_clean(ip))
+ goto out;
+
+ /*
+ * We make this non-blocking if the inode is contended, return
+ * EAGAIN to indicate to the caller that they did not succeed.
+ * This prevents the flush path from blocking on inodes inside
+ * another operation right now, they get caught later by xfs_sync.
+ */
+ if (sync) {
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_iflock(ip);
+
+ error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+ } else {
+ error = EAGAIN;
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ goto out;
+ if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+ goto out_unlock;
+
+ error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+ }
+
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
/*
* if we failed to write out the inode then mark
* it dirty again so we'll try again later.
*/
if (error)
xfs_mark_inode_dirty_sync(ip);
-
return -error;
}
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
statp->f_bfree = statp->f_bavail =
sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
fakeinos = statp->f_bfree << sbp->sb_inopblog;
-#if XFS_BIG_INUMS
- fakeinos += mp->m_inoadd;
-#endif
statp->f_files =
MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
if (mp->m_maxicount)
-#if XFS_BIG_INUMS
- if (!mp->m_inoadd)
-#endif
- statp->f_files = min_t(typeof(statp->f_files),
- statp->f_files,
- mp->m_maxicount);
+ statp->f_files = min_t(typeof(statp->f_files),
+ statp->f_files,
+ mp->m_maxicount);
statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
spin_unlock(&mp->m_sb_lock);
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
}
-STATIC int
-xfs_fs_quotasync(
- struct super_block *sb,
- int type)
-{
- return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
-}
-
-STATIC int
-xfs_fs_getxstate(
- struct super_block *sb,
- struct fs_quota_stat *fqs)
-{
- return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-}
-
-STATIC int
-xfs_fs_setxstate(
- struct super_block *sb,
- unsigned int flags,
- int op)
-{
- return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
-}
-
-STATIC int
-xfs_fs_getxquota(
- struct super_block *sb,
- int type,
- qid_t id,
- struct fs_disk_quota *fdq)
-{
- return -XFS_QM_QUOTACTL(XFS_M(sb),
- (type == USRQUOTA) ? Q_XGETQUOTA :
- ((type == GRPQUOTA) ? Q_XGETGQUOTA :
- Q_XGETPQUOTA), id, (caddr_t)fdq);
-}
-
-STATIC int
-xfs_fs_setxquota(
- struct super_block *sb,
- int type,
- qid_t id,
- struct fs_disk_quota *fdq)
-{
- return -XFS_QM_QUOTACTL(XFS_M(sb),
- (type == USRQUOTA) ? Q_XSETQLIM :
- ((type == GRPQUOTA) ? Q_XSETGQLIM :
- Q_XSETPQLIM), id, (caddr_t)fdq);
-}
-
/*
* This function fills in xfs_mount_t fields based on mount args.
* Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
sb->s_qcop = &xfs_quotactl_operations;
+#endif
sb->s_op = &xfs_super_operations;
error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
.show_options = xfs_fs_show_options,
};
-static struct quotactl_ops xfs_quotactl_operations = {
- .quota_sync = xfs_fs_quotasync,
- .get_xstate = xfs_fs_getxstate,
- .set_xstate = xfs_fs_setxstate,
- .get_xquota = xfs_fs_getxquota,
- .set_xquota = xfs_fs_setxquota,
-};
-
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index d5d776d4cd6..5a2ea3a2178 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
extern struct xattr_handler *xfs_xattr_handlers[];
+extern struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1efe1f..04f058c848a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -19,6 +19,7 @@
#define XFS_SYNC_H 1
struct xfs_mount;
+struct xfs_perag;
typedef struct bhv_vfs_sync_work {
struct list_head w_list;
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index f65983a230d..ad7fbead4c9 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -41,11 +41,6 @@ struct attrlist_cursor_kern;
#define IO_INVIS 0x00020 /* don't update inode timestamps */
/*
- * Flags for xfs_inode_flush
- */
-#define FLUSH_SYNC 1 /* wait for flush to complete */
-
-/*
* Flush/Invalidate options for vop_toss/flush/flushinval_pages.
*/
#define FI_NONE 0 /* none */
@@ -55,33 +50,6 @@ struct attrlist_cursor_kern;
the operation completes. */
/*
- * Dealing with bad inodes
- */
-static inline int VN_BAD(struct inode *vp)
-{
- return is_bad_inode(vp);
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
-{
- bs_atime->tv_sec = vp->i_atime.tv_sec;
- bs_atime->tv_nsec = vp->i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
-{
- *ts = vp->i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
-{
- *tt = vp->i_atime.tv_sec;
-}
-
-/*
* Some useful predicates.
*/
#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 6543c0b2975..e4babcc6342 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
uint flist_locked;
xfs_dquot_t *d;
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+ ASSERT(mutex_is_locked(&qh->qh_lock));
flist_locked = B_FALSE;
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
/*
* move the dquot to the front of the hashchain
*/
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+ ASSERT(mutex_is_locked(&qh->qh_lock));
if (dqp->HL_PREVP != &qh->qh_next) {
xfs_dqtrace_entry(dqp,
"DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
}
xfs_dqtrace_entry(dqp, "LOOKUP END");
*O_dqpp = dqp;
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+ ASSERT(mutex_is_locked(&qh->qh_lock));
return (0);
}
}
*O_dqpp = NULL;
- ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+ ASSERT(mutex_is_locked(&qh->qh_lock));
return (1);
}
@@ -956,7 +956,7 @@ xfs_qm_dqget(
ASSERT(ip->i_gdquot == NULL);
}
#endif
- XFS_DQ_HASH_LOCK(h);
+ mutex_lock(&h->qh_lock);
/*
* Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
*/
ASSERT(*O_dqpp);
ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
- XFS_DQ_HASH_UNLOCK(h);
+ mutex_unlock(&h->qh_lock);
xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
return (0); /* success */
}
@@ -991,7 +991,7 @@ xfs_qm_dqget(
* we don't keep the lock across a disk read
*/
version = h->qh_version;
- XFS_DQ_HASH_UNLOCK(h);
+ mutex_unlock(&h->qh_lock);
/*
* Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
/*
* Hashlock comes after ilock in lock order
*/
- XFS_DQ_HASH_LOCK(h);
+ mutex_lock(&h->qh_lock);
if (version != h->qh_version) {
xfs_dquot_t *tmpdqp;
/*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
* and start over.
*/
xfs_qm_dqput(tmpdqp);
- XFS_DQ_HASH_UNLOCK(h);
+ mutex_unlock(&h->qh_lock);
xfs_qm_dqdestroy(dqp);
XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
* Put the dquot at the beginning of the hash-chain and mp's list
* LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
*/
- ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+ ASSERT(mutex_is_locked(&h->qh_lock));
dqp->q_hash = h;
XQM_HASHLIST_INSERT(h, dqp);
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
xfs_qm_mplist_unlock(mp);
- XFS_DQ_HASH_UNLOCK(h);
+ mutex_unlock(&h->qh_lock);
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
xfs_mount_t *mp = dqp->q_mount;
ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
- ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+ ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
xfs_dqlock(dqp);
/*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
*/
if (dqp->q_nrefs != 0) {
xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+ mutex_unlock(&dqp->q_hash->qh_lock);
return (1);
}
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
memset(&dqp->q_core, 0, sizeof(dqp->q_core));
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(thishash);
+ mutex_unlock(&thishash->qh_lock);
return (0);
}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index d443e93b433..de0f402ddb4 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -34,7 +34,7 @@
*/
typedef struct xfs_dqhash {
struct xfs_dquot *qh_next;
- mutex_t qh_lock;
+ struct mutex qh_lock;
uint qh_version; /* ever increasing version */
uint qh_nelems; /* number of dquots on the list */
} xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
- mutex_t q_qlock; /* quota lock */
+ struct mutex q_qlock; /* quota lock */
struct completion q_flush; /* flush completion queue */
atomic_t q_pincount; /* dquot pin count */
wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
- if (mutex_trylock(&dqp->q_qlock)) {
- mutex_unlock(&dqp->q_qlock);
- return 0;
- }
- return 1;
-}
-#endif
-
-
/*
* Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
complete(&dqp->q_flush);
}
+#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7a2beb64314..5b6695049e0 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,7 +55,7 @@
* quota functionality, including maintaining the freelist and hash
* tables of dquots.
*/
-mutex_t xfs_Gqm_lock;
+struct mutex xfs_Gqm_lock;
struct xfs_qm *xfs_Gqm;
uint ndquot;
@@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
};
#ifdef DEBUG
-extern mutex_t qcheck_lock;
+extern struct mutex qcheck_lock;
#endif
#ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
* the structure could disappear between the entry to this routine and
* a HOLD operation if not locked.
*/
- XFS_QM_LOCK(xfs_Gqm);
+ mutex_lock(&xfs_Gqm_lock);
if (xfs_Gqm == NULL)
xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
* debugging and statistical purposes, but ...
* Just take a reference and get out.
*/
- XFS_QM_HOLD(xfs_Gqm);
- XFS_QM_UNLOCK(xfs_Gqm);
+ xfs_Gqm->qm_nrefs++;
+ mutex_unlock(&xfs_Gqm_lock);
return 0;
}
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
* Destroy the entire XQM. If somebody mounts with quotaon, this'll
* be restarted.
*/
- XFS_QM_LOCK(xfs_Gqm);
- XFS_QM_RELE(xfs_Gqm);
- if (xfs_Gqm->qm_nrefs == 0) {
+ mutex_lock(&xfs_Gqm_lock);
+ if (--xfs_Gqm->qm_nrefs == 0) {
xfs_qm_destroy(xfs_Gqm);
xfs_Gqm = NULL;
}
- XFS_QM_UNLOCK(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm_lock);
}
/*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
continue;
}
- if (! xfs_qm_dqhashlock_nowait(dqp)) {
+ if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
nrecl = XFS_QI_MPLRECLAIMS(mp);
xfs_qm_mplist_unlock(mp);
- XFS_DQ_HASH_LOCK(dqp->q_hash);
+ mutex_lock(&dqp->q_hash->qh_lock);
xfs_qm_mplist_lock(mp);
/*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
* this point, but somebody might be taking things off.
*/
if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+ mutex_unlock(&dqp->q_hash->qh_lock);
goto again;
}
}
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
xfs_dqid_t id,
uint type,
uint doalloc,
- uint dolock,
xfs_dquot_t *udqhint, /* hint */
xfs_dquot_t **IO_idqpp)
{
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
error = 0;
+
/*
* See if we already have it in the inode itself. IO_idqpp is
* &i_udquot or &i_gdquot. This made the code look weird, but
* made the logic a lot simpler.
*/
- if ((dqp = *IO_idqpp)) {
- if (dolock)
- xfs_dqlock(dqp);
+ dqp = *IO_idqpp;
+ if (dqp) {
xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
- goto done;
+ return 0;
}
/*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
* lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
* the user dquot.
*/
- ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
- if (udqhint && !dolock)
+ if (udqhint) {
+ ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
xfs_dqlock(udqhint);
- /*
- * No need to take dqlock to look at the id.
- * The ID can't change until it gets reclaimed, and it won't
- * be reclaimed as long as we have a ref from inode and we hold
- * the ilock.
- */
- if (udqhint &&
- (dqp = udqhint->q_gdquot) &&
- (be32_to_cpu(dqp->q_core.d_id) == id)) {
- ASSERT(XFS_DQ_IS_LOCKED(udqhint));
- xfs_dqlock(dqp);
- XFS_DQHOLD(dqp);
- ASSERT(*IO_idqpp == NULL);
- *IO_idqpp = dqp;
- if (!dolock) {
+ /*
+ * No need to take dqlock to look at the id.
+ *
+ * The ID can't change until it gets reclaimed, and it won't
+ * be reclaimed as long as we have a ref from inode and we
+ * hold the ilock.
+ */
+ dqp = udqhint->q_gdquot;
+ if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+ xfs_dqlock(dqp);
+ XFS_DQHOLD(dqp);
+ ASSERT(*IO_idqpp == NULL);
+ *IO_idqpp = dqp;
+
xfs_dqunlock(dqp);
xfs_dqunlock(udqhint);
+ return 0;
}
- goto done;
- }
- /*
- * We can't hold a dquot lock when we call the dqget code.
- * We'll deadlock in no time, because of (not conforming to)
- * lock ordering - the inodelock comes before any dquot lock,
- * and we may drop and reacquire the ilock in xfs_qm_dqget().
- */
- if (udqhint)
+
+ /*
+ * We can't hold a dquot lock when we call the dqget code.
+ * We'll deadlock in no time, because of (not conforming to)
+ * lock ordering - the inodelock comes before any dquot lock,
+ * and we may drop and reacquire the ilock in xfs_qm_dqget().
+ */
xfs_dqunlock(udqhint);
+ }
+
/*
* Find the dquot from somewhere. This bumps the
* reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
* disk and we didn't ask it to allocate;
* ESRCH if quotas got turned off suddenly.
*/
- if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
- doalloc|XFS_QMOPT_DOWARN, &dqp))) {
- if (udqhint && dolock)
- xfs_dqlock(udqhint);
- goto done;
- }
+ error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+ if (error)
+ return error;
xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+
/*
* dqget may have dropped and re-acquired the ilock, but it guarantees
* that the dquot returned is the one that should go in the inode.
*/
*IO_idqpp = dqp;
- ASSERT(dqp);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (! dolock) {
- xfs_dqunlock(dqp);
- goto done;
- }
- if (! udqhint)
- goto done;
-
- ASSERT(udqhint);
- ASSERT(dolock);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (! xfs_qm_dqlock_nowait(udqhint)) {
- xfs_dqunlock(dqp);
- xfs_dqlock(udqhint);
- xfs_dqlock(dqp);
- }
- done:
-#ifdef QUOTADEBUG
- if (udqhint) {
- if (dolock)
- ASSERT(XFS_DQ_IS_LOCKED(udqhint));
- }
- if (! error) {
- if (dolock)
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
- }
-#endif
- return error;
+ xfs_dqunlock(dqp);
+ return 0;
}
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
STATIC void
xfs_qm_dqattach_grouphint(
xfs_dquot_t *udq,
- xfs_dquot_t *gdq,
- uint locked)
+ xfs_dquot_t *gdq)
{
xfs_dquot_t *tmp;
-#ifdef QUOTADEBUG
- if (locked) {
- ASSERT(XFS_DQ_IS_LOCKED(udq));
- ASSERT(XFS_DQ_IS_LOCKED(gdq));
- }
-#endif
- if (! locked)
- xfs_dqlock(udq);
+ xfs_dqlock(udq);
if ((tmp = udq->q_gdquot)) {
if (tmp == gdq) {
- if (! locked)
- xfs_dqunlock(udq);
+ xfs_dqunlock(udq);
return;
}
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
* because the freelist lock comes before dqlocks.
*/
xfs_dqunlock(udq);
- if (locked)
- xfs_dqunlock(gdq);
/*
* we took a hard reference once upon a time in dqget,
* so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
} else {
ASSERT(XFS_DQ_IS_LOCKED(udq));
- if (! locked) {
- xfs_dqlock(gdq);
- }
+ xfs_dqlock(gdq);
}
ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
XFS_DQHOLD(gdq);
udq->q_gdquot = gdq;
}
- if (! locked) {
- xfs_dqunlock(gdq);
- xfs_dqunlock(udq);
- }
+
+ xfs_dqunlock(gdq);
+ xfs_dqunlock(udq);
}
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
* Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
* into account.
* If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
* If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
* Inode may get unlocked and relocked in here, and the caller must deal with
* the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
if (XFS_IS_UQUOTA_ON(mp)) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
NULL, &ip->i_udquot);
if (error)
goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
error = XFS_IS_GQUOTA_ON(mp) ?
xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
ip->i_udquot, &ip->i_gdquot) :
xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
flags & XFS_QMOPT_DQALLOC,
- flags & XFS_QMOPT_DQLOCK,
ip->i_udquot, &ip->i_gdquot);
/*
* Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
/*
* Attach i_gdquot to the gdquot hint inside the i_udquot.
*/
- xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
- flags & XFS_QMOPT_DQLOCK);
+ xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
}
done:
#ifdef QUOTADEBUG
if (! error) {
- if (ip->i_udquot) {
- if (flags & XFS_QMOPT_DQLOCK)
- ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
- }
- if (ip->i_gdquot) {
- if (flags & XFS_QMOPT_DQLOCK)
- ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
- }
if (XFS_IS_UQUOTA_ON(mp))
ASSERT(ip->i_udquot);
if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
* a dqlookup process that holds the hashlock that is
* waiting for the freelist lock.
*/
- if (! xfs_qm_dqhashlock_nowait(dqp)) {
+ if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
/* XXX put a sentinel so that we can come back here */
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- XFS_DQ_HASH_UNLOCK(hash);
+ mutex_unlock(&hash->qh_lock);
xfs_qm_freelist_unlock(xfs_Gqm);
if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
XQM_HASHLIST_REMOVE(hash, dqp);
xfs_dqfunlock(dqp);
xfs_qm_mplist_unlock(dqp->q_mount);
- XFS_DQ_HASH_UNLOCK(hash);
+ mutex_unlock(&hash->qh_lock);
off_freelist:
XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
continue;
}
- if (! xfs_qm_dqhashlock_nowait(dqp))
+ if (!mutex_trylock(&dqp->q_hash->qh_lock))
goto mplistunlock;
ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
XQM_FREELIST_REMOVE(dqp);
dqpout = dqp;
- XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+ mutex_unlock(&dqp->q_hash->qh_lock);
mplistunlock:
xfs_qm_mplist_unlock(dqp->q_mount);
xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
{
xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
}
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
- xfs_dquot_t *dqp)
-{
- int locked;
-
- locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
- return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
- xfs_qm_t *xqm)
-{
- int locked;
-
- locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
- return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
- xfs_mount_t *mp)
-{
- int locked;
-
- ASSERT(mp->m_quotainfo);
- locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
- return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index ddf09166387..a371954cae1 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -27,7 +27,7 @@ struct xfs_qm;
struct xfs_inode;
extern uint ndquot;
-extern mutex_t xfs_Gqm_lock;
+extern struct mutex xfs_Gqm_lock;
extern struct xfs_qm *xfs_Gqm;
extern kmem_zone_t *qm_dqzone;
extern kmem_zone_t *qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t xfs_dqlist_t;
typedef struct xfs_frlist {
struct xfs_dquot *qh_next;
struct xfs_dquot *qh_prev;
- mutex_t qh_lock;
+ struct mutex qh_lock;
uint qh_version;
uint qh_nelems;
} xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
- mutex_t qi_quotaofflock;/* to serialize quotaoff */
+ struct mutex qi_quotaofflock;/* to serialize quotaoff */
xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
uint qi_dqperchunk; /* # ondisk dqs in above chunk */
xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_IWARNLIMIT 5
#define XFS_QM_RTBWARNLIMIT 5
-#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
-
extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
extern void xfs_qm_mount_quotas(xfs_mount_t *);
extern int xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *);
extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+/* quota ops */
+extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+ fs_disk_quota_t *);
+extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+ fs_disk_quota_t *);
+extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
/* vop stuff */
extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
/* list stuff */
extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int xfs_qm_quotactl(struct xfs_mount *, int, int,
- xfs_caddr_t);
#ifdef DEBUG
extern int xfs_qm_internalqcheck(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bc6c5cca3e1..63037c689a4 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
.xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
.xfs_dqstatvfs = xfs_qm_statvfs,
.xfs_dqsync = xfs_qm_sync,
- .xfs_quotactl = xfs_qm_quotactl,
.xfs_dqtrxops = &xfs_trans_dquot_ops,
};
EXPORT_SYMBOL(xfs_qmcore_xfs);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68139b38aed..c7b66f6506c 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -57,135 +57,16 @@
# define qdprintk(s, args...) do { } while (0)
#endif
-STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
- fs_disk_quota_t *);
-STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
uint);
-STATIC uint xfs_qm_import_flags(uint);
STATIC uint xfs_qm_export_flags(uint);
-STATIC uint xfs_qm_import_qtype_flags(uint);
STATIC uint xfs_qm_export_qtype_flags(uint);
STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
fs_disk_quota_t *);
/*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
- xfs_mount_t *mp,
- int cmd,
- int id,
- xfs_caddr_t addr)
-{
- int error;
-
- ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
- /*
- * The following commands are valid even when quotaoff.
- */
- switch (cmd) {
- case Q_XQUOTARM:
- /*
- * Truncate quota files. quota must be off.
- */
- if (XFS_IS_QUOTA_ON(mp))
- return XFS_ERROR(EINVAL);
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- return (xfs_qm_scall_trunc_qfiles(mp,
- xfs_qm_import_qtype_flags(*(uint *)addr)));
-
- case Q_XGETQSTAT:
- /*
- * Get quota status information.
- */
- return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
- case Q_XQUOTAON:
- /*
- * QUOTAON - enabling quota enforcement.
- * Quota accounting must be turned on at mount time.
- */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- return (xfs_qm_scall_quotaon(mp,
- xfs_qm_import_flags(*(uint *)addr)));
-
- case Q_XQUOTAOFF:
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- break;
-
- case Q_XQUOTASYNC:
- return xfs_sync_inodes(mp, SYNC_DELWRI);
-
- default:
- break;
- }
-
- if (! XFS_IS_QUOTA_ON(mp))
- return XFS_ERROR(ESRCH);
-
- switch (cmd) {
- case Q_XQUOTAOFF:
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_quotaoff(mp,
- xfs_qm_import_flags(*(uint *)addr),
- B_FALSE);
- break;
-
- case Q_XGETQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XGETGQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XGETPQUOTA:
- error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
- (fs_disk_quota_t *)addr);
- break;
-
- case Q_XSETQLIM:
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XSETGQLIM:
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
- (fs_disk_quota_t *)addr);
- break;
- case Q_XSETPQLIM:
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return XFS_ERROR(EROFS);
- error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
- (fs_disk_quota_t *)addr);
- break;
-
- default:
- error = XFS_ERROR(EINVAL);
- break;
- }
-
- return (error);
-}
-
-/*
* Turn off quota accounting and/or enforcement for all udquots and/or
* gdquots. Called only at unmount time.
*
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
* incore, and modifies the ondisk dquot directly. Therefore, for example,
* it is an error to call this twice, without purging the cache.
*/
-STATIC int
+int
xfs_qm_scall_quotaoff(
xfs_mount_t *mp,
- uint flags,
- boolean_t force)
+ uint flags)
{
uint dqtype;
int error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
xfs_qoff_logitem_t *qoffstart;
int nculprits;
- if (!force && !capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
/*
* No file system can have quotas enabled on disk but not in core.
* Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
return (error);
}
-STATIC int
+int
xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
int error = 0, error2 = 0;
xfs_inode_t *qip;
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
* effect immediately.
* (Switching on quota accounting must be done at mount time.)
*/
-STATIC int
+int
xfs_qm_scall_quotaon(
xfs_mount_t *mp,
uint flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
uint accflags;
__int64_t sbflags;
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
-
flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
/*
* Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
/*
* Return quota status information, such as uquota-off, enforcements, etc.
*/
-STATIC int
+int
xfs_qm_scall_getqstat(
xfs_mount_t *mp,
fs_quota_stat_t *out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
/*
* Adjust quota limits, and start/stop timers accordingly.
*/
-STATIC int
+int
xfs_qm_scall_setqlim(
xfs_mount_t *mp,
xfs_dqid_t id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
int error;
xfs_qcnt_t hard, soft;
- if (!capable(CAP_SYS_ADMIN))
- return XFS_ERROR(EPERM);
-
if ((newlim->d_fieldmask &
(FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
return error;
}
-STATIC int
+int
xfs_qm_scall_getquota(
xfs_mount_t *mp,
xfs_dqid_t id,
@@ -935,30 +805,6 @@ xfs_qm_export_dquot(
}
STATIC uint
-xfs_qm_import_qtype_flags(
- uint uflags)
-{
- uint oflags = 0;
-
- /*
- * Can't be more than one, or none.
- */
- if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
- (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
- ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
- (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
- ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
- (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
- ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
- return (0);
-
- oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
- oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
- oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
- return oflags;
-}
-
-STATIC uint
xfs_qm_export_qtype_flags(
uint flags)
{
@@ -979,26 +825,6 @@ xfs_qm_export_qtype_flags(
}
STATIC uint
-xfs_qm_import_flags(
- uint uflags)
-{
- uint flags = 0;
-
- if (uflags & XFS_QUOTA_UDQ_ACCT)
- flags |= XFS_UQUOTA_ACCT;
- if (uflags & XFS_QUOTA_PDQ_ACCT)
- flags |= XFS_PQUOTA_ACCT;
- if (uflags & XFS_QUOTA_GDQ_ACCT)
- flags |= XFS_GQUOTA_ACCT;
- if (uflags & XFS_QUOTA_UDQ_ENFD)
- flags |= XFS_UQUOTA_ENFD;
- if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
- flags |= XFS_OQUOTA_ENFD;
- return (flags);
-}
-
-
-STATIC uint
xfs_qm_export_flags(
uint flags)
{
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
xfs_dqhash_t *qmtest_gdqtab;
int qmtest_hashmask;
int qmtest_nfails;
-mutex_t qcheck_lock;
+struct mutex qcheck_lock;
#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
(__psunsigned_t)(id)) & \
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index c4fcea600bc..8286b2842b6 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -42,34 +42,24 @@
#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
-#define XQMLCK(h) (mutex_lock(&((h)->qh_lock)))
-#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock)))
-#ifdef DEBUG
-struct xfs_dqhash;
-static inline int XQMISLCKD(struct xfs_dqhash *h)
-{
- if (mutex_trylock(&h->qh_lock)) {
- mutex_unlock(&h->qh_lock);
- return 0;
- }
- return 1;
-}
-#endif
-
-#define XFS_DQ_HASH_LOCK(h) XQMLCK(h)
-#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
-#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
-
-#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
-#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
-#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
-
-#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
-#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
+#define xfs_qm_mplist_lock(mp) \
+ mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_nowait(mp) \
+ mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_unlock(mp) \
+ mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define XFS_QM_IS_MPLIST_LOCKED(mp) \
+ mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
+
+#define xfs_qm_freelist_lock(qm) \
+ mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_lock_nowait(qm) \
+ mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_unlock(qm) \
+ mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
/*
* Hash into a bucket in the dquot hash table, based on <mp, id>.
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 99611381e74..447173bcf96 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
xfs_qcnt_t *resbcountp;
xfs_quotainfo_t *q = mp->m_quotainfo;
- if (! (flags & XFS_QMOPT_DQLOCK)) {
- xfs_dqlock(dqp);
- }
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ xfs_dqlock(dqp);
+
if (flags & XFS_TRANS_DQ_RES_BLKS) {
hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
error_return:
- if (! (flags & XFS_QMOPT_DQLOCK)) {
- xfs_dqunlock(dqp);
- }
- return (error);
+ xfs_dqunlock(dqp);
+ return error;
}
@@ -753,8 +750,7 @@ error_return:
* grp/prj quotas is important, because this follows a both-or-nothing
* approach.
*
- * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
- * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
* XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
* XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
* XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index ae548296542..3f3610a7ee0 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -24,6 +24,7 @@
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_error.h"
static char message[1024]; /* keep it off the stack */
static DEFINE_SPINLOCK(xfs_err_lock);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 5830c040ea7..b83f76b6d41 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,10 +17,6 @@
*/
#include <xfs.h>
-static DEFINE_MUTEX(uuid_monitor);
-static int uuid_table_size;
-static uuid_t *uuid_table;
-
/* IRIX interpretation of an uuid_t */
typedef struct {
__be32 uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
fsid[1] = be32_to_cpu(uup->uu_timelow);
}
-void
-uuid_create_nil(uuid_t *uuid)
-{
- memset(uuid, 0, sizeof(*uuid));
-}
-
int
uuid_is_nil(uuid_t *uuid)
{
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
{
return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
}
-
-/*
- * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
- * 64-bit words. NOTE: This function can not be changed EVER. Although
- * brain-dead, some applications depend on this 64-bit value remaining
- * persistent. Specifically, DMI vendors store the value as a persistent
- * filehandle.
- */
-__uint64_t
-uuid_hash64(uuid_t *uuid)
-{
- __uint64_t *sp = (__uint64_t *)uuid;
-
- return sp[0] + sp[1];
-}
-
-int
-uuid_table_insert(uuid_t *uuid)
-{
- int i, hole;
-
- mutex_lock(&uuid_monitor);
- for (i = 0, hole = -1; i < uuid_table_size; i++) {
- if (uuid_is_nil(&uuid_table[i])) {
- hole = i;
- continue;
- }
- if (uuid_equal(uuid, &uuid_table[i])) {
- mutex_unlock(&uuid_monitor);
- return 0;
- }
- }
- if (hole < 0) {
- uuid_table = kmem_realloc(uuid_table,
- (uuid_table_size + 1) * sizeof(*uuid_table),
- uuid_table_size * sizeof(*uuid_table),
- KM_SLEEP);
- hole = uuid_table_size++;
- }
- uuid_table[hole] = *uuid;
- mutex_unlock(&uuid_monitor);
- return 1;
-}
-
-void
-uuid_table_remove(uuid_t *uuid)
-{
- int i;
-
- mutex_lock(&uuid_monitor);
- for (i = 0; i < uuid_table_size; i++) {
- if (uuid_is_nil(&uuid_table[i]))
- continue;
- if (!uuid_equal(uuid, &uuid_table[i]))
- continue;
- uuid_create_nil(&uuid_table[i]);
- break;
- }
- ASSERT(i < uuid_table_size);
- mutex_unlock(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index cff5b607d44..4732d71262c 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,12 +22,8 @@ typedef struct {
unsigned char __u_bits[16];
} uuid_t;
-extern void uuid_create_nil(uuid_t *uuid);
extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-extern __uint64_t uuid_hash64(uuid_t *uuid);
-extern int uuid_table_insert(uuid_t *uuid);
-extern void uuid_table_remove(uuid_t *uuid);
#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 143d63ecb20..c8641f713ca 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,8 +223,8 @@ typedef struct xfs_perag
be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
#define XFS_MIN_FREELIST_PAG(pag,mp) \
(XFS_MIN_FREELIST_RAW( \
- (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
#define XFS_AGB_TO_FSB(mp,agno,agbno) \
(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 028e44e58ea..2cf944eb796 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1872,6 +1872,25 @@ xfs_alloc_compute_maxlevels(
}
/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t need, delta = 0;
+
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ if (need > pag->pagf_flcount)
+ delta = need - pag->pagf_flcount;
+
+ if (pag->pagf_longest > delta)
+ return pag->pagf_longest - delta;
+ return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
}
if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
/*
* If it looks like there isn't a long enough extent, or enough
* total blocks, reject it.
*/
- longest = (pag->pagf_longest > delta) ?
- (pag->pagf_longest - delta) :
- (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ longest = xfs_alloc_longest_free_extent(mp, pag);
if ((args->minlen + args->alignment + args->minalignslop - 1) >
longest ||
((int)(pag->pagf_freeblks + pag->pagf_flcount -
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 588172796f7..e704caee10d 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag);
#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 6c323f8a4cd..afdc8911637 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
* minimum offset only needs to be the space required for
* the btree root.
*/
- if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset)
+ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+ xfs_default_attroffset(dp))
dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
break;
@@ -298,6 +299,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
}
/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp)
+{
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ ip->i_d.di_forkoff = 0;
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+ ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_afp == NULL);
+
+ ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
* Remove an attribute from the shortform attribute list structure.
*/
int
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
*/
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME) &&
- (mp->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
- /*
- * Last attribute now removed, revert to original
- * inode format making all literal area available
- * to the data fork once more.
- */
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
- dp->i_d.di_forkoff = 0;
- dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ASSERT(dp->i_d.di_anextents == 0);
- ASSERT(dp->i_afp == NULL);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ (mp->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ xfs_attr_fork_reset(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
if (forkoff == -1) {
ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
-
- /*
- * Last attribute was removed, revert to original
- * inode format making all literal area available
- * to the data fork once more.
- */
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
- dp->i_d.di_forkoff = 0;
- dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ASSERT(dp->i_d.di_anextents == 0);
- ASSERT(dp->i_afp == NULL);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ xfs_attr_fork_reset(dp, args->trans);
goto out;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c852cd65aae..3a6ed426327 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
- * try to use it's last block as our starting point.
+ * try to use its last block as our starting point.
*/
if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
!isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
xfs_agnumber_t startag;
xfs_alloc_arg_t args;
xfs_extlen_t blen;
- xfs_extlen_t delta;
- xfs_extlen_t longest;
- xfs_extlen_t need;
xfs_extlen_t nextminlen = 0;
xfs_perag_t *pag;
int nullfb; /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
* See xfs_alloc_fix_freelist...
*/
if (pag->pagf_init) {
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- delta = need > pag->pagf_flcount ?
- need - pag->pagf_flcount : 0;
- longest = (pag->pagf_longest > delta) ?
- (pag->pagf_longest - delta) :
- (pag->pagf_flcount > 0 ||
- pag->pagf_longest > 0);
+ xfs_extlen_t longest;
+ longest = xfs_alloc_longest_free_extent(mp, pag);
if (blen < longest)
blen = longest;
} else
@@ -3577,6 +3569,27 @@ xfs_bmap_extents_to_btree(
}
/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint offset;
+
+ if (mp->m_sb.sb_inodesize == 256) {
+ offset = XFS_LITINO(mp) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ } else {
+ offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+ }
+
+ ASSERT(offset < XFS_LITINO(mp));
+ return offset;
+}
+
+/*
* Helper routine to reset inode di_forkoff field when switching
* attribute fork from local to extent format - we reset it where
* possible to make space available for inline data fork extents.
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
int whichfork)
{
if (whichfork == XFS_ATTR_FORK &&
- (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
- ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
- ip->i_d.di_forkoff = mp->m_attroffset >> 3;
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
- (uint)sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
- (uint)sizeof(xfs_bmbt_rec_t);
+ ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+ ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_d.di_forkoff) {
+ ip->i_d.di_forkoff = dfl_forkoff;
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ ip->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ }
}
}
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
case XFS_DINODE_FMT_BTREE:
ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
else if (mp->m_flags & XFS_MOUNT_ATTR2)
version = 2;
break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
* (a signed 16-bit number, xfs_aextnum_t).
*
* Note that we can no longer assume that if we are in ATTR1 that
- * the fork offset of all the inodes will be (m_attroffset >> 3)
- * because we could have mounted with ATTR2 and then mounted back
- * with ATTR1, keeping the di_forkoff's fixed but probably at
- * various positions. Therefore, for both ATTR1 and ATTR2
- * we have to assume the worst case scenario of a minimum size
- * available.
+ * the fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+ * with ATTR2 and then mounted back with ATTR1, keeping the
+ * di_forkoff's fixed but probably at various positions. Therefore,
+ * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+ * of a minimum size available.
*/
if (whichfork == XFS_DATA_FORK) {
maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
xfs_extlen_t minlen; /* min allocation size */
xfs_mount_t *mp; /* xfs mount structure */
int n; /* current extent index */
- int nallocs; /* number of extents alloc\'d */
+ int nallocs; /* number of extents alloc'd */
xfs_extnum_t nextents; /* number of extents in file */
xfs_fileoff_t obno; /* old block number (offset) */
xfs_bmbt_irec_t prev; /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
return(bp);
}
-void
+STATIC void
xfs_check_block(
struct xfs_btree_block *block,
xfs_mount_t *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
block = XFS_BUF_TO_BLOCK(bp);
if (--level) {
- /* Not at node above leafs, count this level of nodes */
+ /* Not at node above leaves, count this level of nodes */
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index be2979d88d3..1b8ff9256bd 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
xfs_extlen_t alen; /* i/o length asked/allocated */
xfs_extlen_t total; /* total blocks needed for xaction */
- xfs_extlen_t minlen; /* mininum allocation size (blocks) */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
xfs_extlen_t minleft; /* amount must be left after alloc */
char eof; /* set if allocating past last extent */
char wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
xfs_extnum_t idx,
xfs_extnum_t num);
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip);
+
#ifdef __KERNEL__
/*
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e73c332eb23..e9df9957482 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
/*
* We add one entry to the left side and remove one for the right side.
- * Accout for it here, the changes will be updated on disk and logged
+ * Account for it here, the changes will be updated on disk and logged
* later.
*/
lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
XFS_BTREE_STATS_INC(cur, join);
/*
- * Fix up the the number of records and right block pointer in the
+ * Fix up the number of records and right block pointer in the
* surviving block, and log it.
*/
xfs_btree_set_numrecs(left, lrecs + rrecs);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 789fffdf8b2..4f852b735b9 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -41,7 +41,7 @@ extern kmem_zone_t *xfs_btree_cur_zone;
/*
* Generic btree header.
*
- * This is a comination of the actual format used on disk for short and long
+ * This is a combination of the actual format used on disk for short and long
* format btrees. The first three fields are shared by both format, but
* the pointers are different and should be used with care.
*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c45f74ff1a5..9ff6e57a507 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* This is implemented with some source-level loop unrolling.
*/
xfs_dahash_t
-xfs_da_hashname(const uchar_t *name, int namelen)
+xfs_da_hashname(const __uint8_t *name, int namelen)
{
xfs_dahash_t hash;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 70b710c1792..8c536167bf7 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -91,9 +91,9 @@ enum xfs_dacmp {
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
- const uchar_t *name; /* string (maybe not NULL terminated) */
+ const __uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
- uchar_t *value; /* set of bytes (maybe contain NULLs) */
+ __uint8_t *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
int flags; /* argument flags (eg: ATTR_NOCREATE) */
xfs_dahash_t hashval; /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
unsigned char inleaf; /* insert into 1->lf, 0->splf */
unsigned char extravalid; /* T/F: extrablk is in use */
unsigned char extraafter; /* T/F: extrablk is after new */
- xfs_da_state_blk_t extrablk; /* for double-splits on leafs */
+ xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
/* for dirv2 extrablk is data */
} xfs_da_state_t;
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
xfs_dabuf_t *dead_buf);
-uint xfs_da_hashname(const uchar_t *name_string, int name_length);
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
const char *name, int len);
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
extern struct kmem_zone *xfs_da_state_zone;
extern struct kmem_zone *xfs_dabuf_zone;
+extern const struct xfs_nameops xfs_default_nameops;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f8278cfcc1d..e6d839bddbf 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -79,6 +79,12 @@ xfs_swapext(
goto out_put_target_file;
}
+ if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
+ IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_target_file;
+ }
+
ip = XFS_I(file->f_path.dentry->d_inode);
tip = XFS_I(target_file->f_path.dentry->d_inode);
@@ -118,19 +124,17 @@ xfs_swap_extents(
xfs_bstat_t *sbp = &sxp->sx_stat;
xfs_ifork_t *tempifp, *ifp, *tifp;
int ilf_fields, tilf_fields;
- static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
int error = 0;
int aforkblks = 0;
int taforkblks = 0;
__uint64_t tmp;
- char locked = 0;
mp = ip->i_mount;
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = XFS_ERROR(ENOMEM);
- goto error0;
+ goto out;
}
sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
*/
xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
- locked = 1;
/* Verify that both files have the same format */
if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
error = XFS_ERROR(EINVAL);
- goto error0;
+ goto out_unlock;
}
/* Verify both files are either real-time or non-realtime */
if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
error = XFS_ERROR(EINVAL);
- goto error0;
+ goto out_unlock;
}
/* Should never get a local format */
if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
error = XFS_ERROR(EINVAL);
- goto error0;
+ goto out_unlock;
}
if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
error = xfs_flushinval_pages(tip, 0, -1,
FI_REMAPF_LOCKED);
if (error)
- goto error0;
+ goto out_unlock;
}
/* Verify O_DIRECT for ftmp */
if (VN_CACHED(VFS_I(tip)) != 0) {
error = XFS_ERROR(EINVAL);
- goto error0;
+ goto out_unlock;
}
/* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
sxp->sx_length != ip->i_d.di_size ||
sxp->sx_length != tip->i_d.di_size) {
error = XFS_ERROR(EFAULT);
- goto error0;
+ goto out_unlock;
}
/*
@@ -193,7 +196,7 @@ xfs_swap_extents(
*/
if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
error = XFS_ERROR(EINVAL);
- goto error0;
+ goto out_unlock;
}
/*
@@ -208,7 +211,7 @@ xfs_swap_extents(
(sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
(sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
error = XFS_ERROR(EBUSY);
- goto error0;
+ goto out_unlock;
}
/* We need to fail if the file is memory mapped. Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
*/
if (VN_MAPPED(VFS_I(ip))) {
error = XFS_ERROR(EBUSY);
- goto error0;
+ goto out_unlock;
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
xfs_iunlock(tip, XFS_IOLOCK_EXCL);
xfs_trans_cancel(tp, 0);
- locked = 0;
- goto error0;
+ goto out;
}
xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
@@ -253,19 +255,15 @@ xfs_swap_extents(
if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto error0;
- }
+ if (error)
+ goto out_trans_cancel;
}
if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
(tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
&taforkblks);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto error0;
- }
+ if (error)
+ goto out_trans_cancel;
}
/*
@@ -332,10 +330,10 @@ xfs_swap_extents(
IHOLD(ip);
- xfs_trans_ijoin(tp, ip, lock_flags);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
IHOLD(tip);
- xfs_trans_ijoin(tp, tip, lock_flags);
+ xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_log_inode(tp, ip, ilf_fields);
xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- }
error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
- locked = 0;
- error0:
- if (locked) {
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
- }
- if (tempifp != NULL)
- kmem_free(tempifp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out:
+ kmem_free(tempifp);
return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
}
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 162e8726df5..e5b153b2e6a 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp) ((mp)->m_litino)
+#define XFS_LITINO(mp) \
+ ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+
#define XFS_BROOT_SIZE_ADJ \
(XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 1afb12278b8..c657bec6d95 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,8 +46,6 @@
struct xfs_name xfs_name_dotdot = {"..", 2};
-extern const struct xfs_nameops xfs_default_nameops;
-
/*
* ASCII case-insensitive (ie. A-Z) support for directories that was
* used in IRIX.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e1f0a06aaf0..ab52e9e1c1e 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
xfs_mount_t *mp; /* filesystem mount point */
char *ptr; /* current data entry */
int wantoff; /* starting block offset */
- xfs_ino_t ino;
xfs_off_t cook;
mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
(char *)dep - (char *)block);
- ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
- ino += mp->m_inoadd;
-#endif
/*
* If it didn't fit, set the final offset to here & return.
*/
if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
- ino, DT_UNKNOWN)) {
+ be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
*offset = cook & 0x7fffffff;
xfs_da_brelse(NULL, bp);
return 0;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index b816e025273..efbc290c7fe 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -38,7 +38,7 @@ struct xfs_trans;
/*
* Directory address space divided into sections,
- * spaces separated by 32gb.
+ * spaces separated by 32GB.
*/
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ef805a374ee..fa913e45944 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
* Check the internal consistency of a leaf1 block.
* Pop an assert if something is wrong.
*/
-void
+STATIC void
xfs_dir2_leaf_check(
xfs_inode_t *dp, /* incore directory inode */
xfs_dabuf_t *bp) /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
int ra_index; /* *map index for read-ahead */
int ra_offset; /* map entry offset for ra */
int ra_want; /* readahead count wanted */
- xfs_ino_t ino;
/*
* If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
continue;
}
- /*
- * Copy the entry into the putargs, and try formatting it.
- */
dep = (xfs_dir2_data_entry_t *)ptr;
-
length = xfs_dir2_data_entsize(dep->namelen);
- ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
- ino += mp->m_inoadd;
-#endif
-
- /*
- * Won't fit. Return to caller.
- */
if (filldir(dirent, dep->name, dep->namelen,
xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
- ino, DT_UNKNOWN))
+ be64_to_cpu(dep->inumber), DT_UNKNOWN))
break;
/*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index fa6c3a5ddbc..5a81ccd1045 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
}
xfs_dir2_leafn_check(dp, bp);
/*
- * Return indication of whether this leaf block is emtpy enough
+ * Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
*rval =
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index a8a8a6efad5..e89734e8464 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
* Put . entry unless we're starting past it.
*/
if (*offset <= dot_offset) {
- ino = dp->i_ino;
-#if XFS_BIG_INUMS
- ino += mp->m_inoadd;
-#endif
- if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
+ if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
*offset = dot_offset & 0x7fffffff;
return 0;
}
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
*/
if (*offset <= dotdot_offset) {
ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
-#if XFS_BIG_INUMS
- ino += mp->m_inoadd;
-#endif
if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
*offset = dotdot_offset & 0x7fffffff;
return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
}
ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-#if XFS_BIG_INUMS
- ino += mp->m_inoadd;
-#endif
-
if (filldir(dirent, sfep->name, sfep->namelen,
off & 0x7fffffff, ino, DT_UNKNOWN)) {
*offset = off & 0x7fffffff;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2f049f63e85..0d22c56fdf6 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
* conversion routine.
*/
-#ifndef HAVE_FORMAT32
typedef struct xfs_extent_32 {
__uint64_t ext_start;
__uint32_t ext_len;
} __attribute__((packed)) xfs_extent_32_t;
-#endif
typedef struct xfs_extent_64 {
__uint64_t ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
xfs_extent_t efi_extents[1]; /* array of extents to free */
} xfs_efi_log_format_t;
-#ifndef HAVE_FORMAT32
typedef struct xfs_efi_log_format_32 {
__uint16_t efi_type; /* efi log item type */
__uint16_t efi_size; /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
__uint64_t efi_id; /* efi identifier */
xfs_extent_32_t efi_extents[1]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
-#endif
typedef struct xfs_efi_log_format_64 {
__uint16_t efi_type; /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
xfs_extent_t efd_extents[1]; /* array of extents freed */
} xfs_efd_log_format_t;
-#ifndef HAVE_FORMAT32
typedef struct xfs_efd_log_format_32 {
__uint16_t efd_type; /* efd log item type */
__uint16_t efd_size; /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
__uint64_t efd_efi_id; /* id of corresponding efi */
xfs_extent_32_t efd_extents[1]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
-#endif
typedef struct xfs_efd_log_format_64 {
__uint16_t efd_type; /* efd log item type */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index f3bb75da384..6c87c8f304e 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
xfs_extlen_t minlen)
{
int err, trylock, nscan;
- xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0;
+ xfs_extlen_t longest, free, minfree, maxfree = 0;
xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
struct xfs_perag *pag;
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
goto next_ag;
}
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
- longest = (pag->pagf_longest > delta) ?
- (pag->pagf_longest - delta) :
- (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
-
+ longest = xfs_alloc_longest_free_extent(mp, pag);
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 680d0e0ec93..8379e3bca26 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -576,7 +576,7 @@ out:
if (fdblks_delta) {
/*
* If we are putting blocks back here, m_resblks_avail is
- * already at it's max so this will put it in the free pool.
+ * already at its max so this will put it in the free pool.
*
* If we need space, we'll either succeed in getting it
* from the free block count or we'll get an enospc. If
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab016e5ae7b..3120a3a5e20 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
/* Allow space for the inode btree to split. */
- args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+ args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
} else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
/*
* Allow space for the inode btree to split.
*/
- args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+ args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
* Initialize all inodes in this buffer and then log them.
*
* XXX: It would be much better if we had just one transaction to
- * log a whole cluster of inodes instead of all the indivdual
+ * log a whole cluster of inodes instead of all the individual
* transactions causing a lot of log traffic.
*/
xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
- XFS_INOBT_CLR_FREE(&rec, offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
*/
off = agino - rec.ir_startino;
ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
- ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
/*
* Mark the inode free & increment the count.
*/
- XFS_INOBT_SET_FREE(&rec, off);
+ rec.ir_free |= XFS_INOBT_MASK(off);
rec.ir_freecount++;
/*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 99f2408e8d8..c282a9af539 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
}
/*
- * intial value of ptr for lookup
+ * initial value of ptr for lookup
*/
STATIC void
xfs_inobt_init_ptr_from_cur(
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 5580e255ff0..f782ad0c476 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -32,14 +32,14 @@ struct xfs_mount;
#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
typedef __uint64_t xfs_inofree_t;
-#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
-#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
- return (((n) >= XFS_INODES_PER_CHUNK ? \
- (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}
/*
@@ -69,20 +69,6 @@ typedef struct xfs_inobt_key {
typedef __be32 xfs_inobt_ptr_t;
/*
- * Bit manipulations for ir_free.
- */
-#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
-#define XFS_INOBT_IS_FREE(rp,i) \
- (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
-#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
-#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
-
-/*
- * Maximum number of inode btree levels.
- */
-#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
-
-/*
* block numbers in the AG.
*/
#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1f175fa34b2..f879c1bc4b9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
/*
* NOTE: This structure must be kept identical to struct xfs_dinode
- * in xfs_dinode.h except for the endianess annotations.
+ * in xfs_dinode.h except for the endianness annotations.
*/
typedef struct xfs_icdinode {
__uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9957d0602d5..a52ac125f05 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
__int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_t;
-#ifndef HAVE_FORMAT32
typedef struct xfs_inode_log_format_32 {
__uint16_t ilf_type; /* inode log item type */
__uint16_t ilf_size; /* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
__int32_t ilf_len; /* len of inode buffer */
__int32_t ilf_boffset; /* off of inode in buffer */
} __attribute__((packed)) xfs_inode_log_format_32_t;
-#endif
typedef struct xfs_inode_log_format_64 {
__uint16_t ilf_type; /* inode log item type */
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ee1a0c134cc..a1cc1322fc0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -63,7 +63,7 @@ typedef enum {
*/
typedef struct xfs_iomap {
- xfs_daddr_t iomap_bn; /* first 512b blk of mapping */
+ xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
xfs_buftarg_t *iomap_target;
xfs_off_t iomap_offset; /* offset of mapping, bytes */
xfs_off_t iomap_bsize; /* size of mapping, bytes */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf98a805ec9..aeb2d2221c7 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
- vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
+ /*
+ * We are reading the atime from the Linux inode because the
+ * dinode might not be uptodate.
+ */
+ buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
* first inode of the cluster.
*
* Careful with clustidx. There can be
- * multple clusters per chunk, a single
+ * multiple clusters per chunk, a single
* cluster per chunk or a cluster that has
* inodes represented from several different
* chunks (if blocksize is large).
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f4726f702a9..f76c6d7cea2 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -574,7 +574,7 @@ xfs_log_mount(
error = xfs_trans_ail_init(mp);
if (error) {
cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
- goto error;
+ goto out_free_log;
}
mp->m_log->l_ailp = mp->m_ail;
@@ -594,20 +594,22 @@ xfs_log_mount(
mp->m_flags |= XFS_MOUNT_RDONLY;
if (error) {
cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
- goto error;
+ goto out_destroy_ail;
}
}
/* Normal transactions can now occur */
mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
- /* End mounting message in xfs_log_mount_finish */
return 0;
-error:
- xfs_log_unmount_dealloc(mp);
+
+out_destroy_ail:
+ xfs_trans_ail_destroy(mp);
+out_free_log:
+ xlog_dealloc_log(mp->m_log);
out:
return error;
-} /* xfs_log_mount */
+}
/*
* Finish the recovery of the file system. This is separate from
@@ -633,19 +635,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
}
/*
- * Unmount processing for the log.
- */
-int
-xfs_log_unmount(xfs_mount_t *mp)
-{
- int error;
-
- error = xfs_log_unmount_write(mp);
- xfs_log_unmount_dealloc(mp);
- return error;
-}
-
-/*
* Final log writes as part of unmount.
*
* Mark the filesystem clean as unmount happens. Note that during relocation
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
* and deallocate the log as the aild references the log.
*/
void
-xfs_log_unmount_dealloc(xfs_mount_t *mp)
+xfs_log_unmount(xfs_mount_t *mp)
{
xfs_trans_ail_destroy(mp);
xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
/*
* Return size of each in-core log record buffer.
*
- * All machines get 8 x 32KB buffers by default, unless tuned otherwise.
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
*
* If the filesystem blocksize is too large, we may need to choose a
* larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
}
if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- /* # headers = size / 32K
- * one header holds cycles from 32K of data
+ /* # headers = size / 32k
+ * one header holds cycles from 32k of data
*/
xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
goto done;
}
- /* All machines use 32KB buffers by default. */
+ /* All machines use 32kB buffers by default. */
log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp,
log->l_iclog_hsize = BBSIZE;
log->l_iclog_heads = 1;
- /*
- * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use
- * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
- */
- if (mp->m_sb.sb_blocksize >= 16*1024) {
- log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
- log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
- if (mp->m_logbufs <= 0) {
- switch (mp->m_sb.sb_blocksize) {
- case 16*1024: /* 16 KB */
- log->l_iclog_bufs = 3;
- break;
- case 32*1024: /* 32 KB */
- log->l_iclog_bufs = 4;
- break;
- case 64*1024: /* 64 KB */
- log->l_iclog_bufs = 8;
- break;
- default:
- xlog_panic("XFS: Invalid blocksize");
- break;
- }
- }
- }
-
-done: /* are we being asked to make the sizes selected above visible? */
+done:
+ /* are we being asked to make the sizes selected above visible? */
if (mp->m_logbufs == 0)
mp->m_logbufs = log->l_iclog_bufs;
if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
*/
/*
- * Free a used ticket when it's refcount falls to zero.
+ * Free a used ticket when its refcount falls to zero.
*/
void
xfs_log_ticket_put(
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 8a3e84e900a..d0c9baa50b1 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp,
int nentries,
xfs_log_ticket_t ticket,
xfs_lsn_t *start_lsn);
-int xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_unmount_write(struct xfs_mount *mp);
-void xfs_log_unmount_dealloc(struct xfs_mount *mp);
+void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
int xfs_log_need_covered(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 654167be0ef..bcad5f4c1fd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
int ic_size;
int ic_offset;
int ic_bwritecnt;
- ushort_t ic_state;
+ unsigned short ic_state;
char *ic_datap; /* pointer to iclog data */
#ifdef XFS_LOG_TRACE
struct ktrace *ic_trace;
@@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log);
extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
extern void xlog_put_bp(struct xfs_buf *);
-extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
extern kmem_zone_t *xfs_log_ticket_zone;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 61af610d79b..7ba450116d4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -94,12 +94,30 @@ xlog_put_bp(
xfs_buf_free(bp);
}
+STATIC xfs_caddr_t
+xlog_align(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp)
+{
+ xfs_caddr_t ptr;
+
+ if (!log->l_sectbb_log)
+ return XFS_BUF_PTR(bp);
+
+ ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
+ ASSERT(XFS_BUF_SIZE(bp) >=
+ BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
+ return ptr;
+}
+
/*
* nbblks should be uint, but oh well. Just want to catch that 32-bit length.
*/
-int
-xlog_bread(
+STATIC int
+xlog_bread_noalign(
xlog_t *log,
xfs_daddr_t blk_no,
int nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
return error;
}
+STATIC int
+xlog_bread(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp,
+ xfs_caddr_t *offset)
+{
+ int error;
+
+ error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+ if (error)
+ return error;
+
+ *offset = xlog_align(log, blk_no, nbblks, bp);
+ return 0;
+}
+
/*
* Write out the buffer at the given block for the given number of blocks.
* The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
return error;
}
-STATIC xfs_caddr_t
-xlog_align(
- xlog_t *log,
- xfs_daddr_t blk_no,
- int nbblks,
- xfs_buf_t *bp)
-{
- xfs_caddr_t ptr;
-
- if (!log->l_sectbb_log)
- return XFS_BUF_PTR(bp);
-
- ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
- ASSERT(XFS_BUF_SIZE(bp) >=
- BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
- return ptr;
-}
-
#ifdef DEBUG
/*
* dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
for (b = 0; b < 16; b++)
- cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
+ cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
cmn_err(CE_DEBUG, " log : uuid = ");
for (b = 0; b < 16; b++)
- cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
+ cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
}
#else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
mid_blk = BLK_AVG(first_blk, *last_blk);
while (mid_blk != first_blk && mid_blk != *last_blk) {
- if ((error = xlog_bread(log, mid_blk, 1, bp)))
+ error = xlog_bread(log, mid_blk, 1, bp, &offset);
+ if (error)
return error;
- offset = xlog_align(log, mid_blk, 1, bp);
mid_cycle = xlog_get_cycle(offset);
if (mid_cycle == cycle) {
*last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
bcount = min(bufblks, (start_blk + nbblks - i));
- if ((error = xlog_bread(log, i, bcount, bp)))
+ error = xlog_bread(log, i, bcount, bp, &buf);
+ if (error)
goto out;
- buf = xlog_align(log, i, bcount, bp);
for (j = 0; j < bcount; j++) {
cycle = xlog_get_cycle(buf);
if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
return ENOMEM;
smallmem = 1;
} else {
- if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+ error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+ if (error)
goto out;
- offset = xlog_align(log, start_blk, num_blks, bp);
offset += ((num_blks - 1) << BBSHIFT);
}
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
}
if (smallmem) {
- if ((error = xlog_bread(log, i, 1, bp)))
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
goto out;
- offset = xlog_align(log, i, 1, bp);
}
head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
bp = xlog_get_bp(log, 1);
if (!bp)
return ENOMEM;
- if ((error = xlog_bread(log, 0, 1, bp)))
+
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
goto bp_err;
- offset = xlog_align(log, 0, 1, bp);
+
first_half_cycle = xlog_get_cycle(offset);
last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
- if ((error = xlog_bread(log, last_blk, 1, bp)))
+ error = xlog_bread(log, last_blk, 1, bp, &offset);
+ if (error)
goto bp_err;
- offset = xlog_align(log, last_blk, 1, bp);
+
last_half_cycle = xlog_get_cycle(offset);
ASSERT(last_half_cycle != 0);
@@ -817,9 +838,10 @@ xlog_find_tail(
if (!bp)
return ENOMEM;
if (*head_blk == 0) { /* special case */
- if ((error = xlog_bread(log, 0, 1, bp)))
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
goto bread_err;
- offset = xlog_align(log, 0, 1, bp);
+
if (xlog_get_cycle(offset) == 0) {
*tail_blk = 0;
/* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
*/
ASSERT(*head_blk < INT_MAX);
for (i = (int)(*head_blk) - 1; i >= 0; i--) {
- if ((error = xlog_bread(log, i, 1, bp)))
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
goto bread_err;
- offset = xlog_align(log, i, 1, bp);
+
if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
found = 1;
break;
@@ -848,9 +871,10 @@ xlog_find_tail(
*/
if (!found) {
for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
- if ((error = xlog_bread(log, i, 1, bp)))
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
goto bread_err;
- offset = xlog_align(log, i, 1, bp);
+
if (XLOG_HEADER_MAGIC_NUM ==
be32_to_cpu(*(__be32 *)offset)) {
found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
umount_data_blk = (i + hblks) % log->l_logBBsize;
- if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+ error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ if (error)
goto bread_err;
- }
- offset = xlog_align(log, umount_data_blk, 1, bp);
+
op_head = (xlog_op_header_t *)offset;
if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
/*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
bp = xlog_get_bp(log, 1);
if (!bp)
return ENOMEM;
- if ((error = xlog_bread(log, 0, 1, bp)))
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
goto bp_err;
- offset = xlog_align(log, 0, 1, bp);
+
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
}
/* check partially zeroed log */
- if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+ error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+ if (error)
goto bp_err;
- offset = xlog_align(log, log_bbnum-1, 1, bp);
+
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
*/
balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
if (balign != start_block) {
- if ((error = xlog_bread(log, start_block, 1, bp))) {
- xlog_put_bp(bp);
- return error;
- }
+ error = xlog_bread_noalign(log, start_block, 1, bp);
+ if (error)
+ goto out_put_bp;
+
j = start_block - balign;
}
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
balign = BBTOB(ealign - start_block);
error = XFS_BUF_SET_PTR(bp, offset + balign,
BBTOB(sectbb));
- if (!error)
- error = xlog_bread(log, ealign, sectbb, bp);
- if (!error)
- error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+ if (error)
+ break;
+
+ error = xlog_bread_noalign(log, ealign, sectbb, bp);
+ if (error)
+ break;
+
+ error = XFS_BUF_SET_PTR(bp, offset, bufblks);
if (error)
break;
}
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
start_block += endcount;
j = 0;
}
+
+ out_put_bp:
xlog_put_bp(bp);
return error;
}
@@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans(
}
write_inode_buffer:
- if (ITEM_TYPE(item) == XFS_LI_INODE) {
- ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
- bp->b_mount = mp;
- XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
- xfs_bdwrite(mp, bp);
- } else {
- XFS_BUF_STALE(bp);
- error = xfs_bwrite(mp, bp);
- }
-
+ ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+ bp->b_mount = mp;
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+ xfs_bdwrite(mp, bp);
error:
if (need_free)
kmem_free(in_f);
@@ -2769,51 +2795,48 @@ xlog_recover_do_trans(
int error = 0;
xlog_recover_item_t *item, *first_item;
- if ((error = xlog_recover_reorder_trans(trans)))
+ error = xlog_recover_reorder_trans(trans);
+ if (error)
return error;
+
first_item = item = trans->r_itemq;
do {
- /*
- * we don't need to worry about the block number being
- * truncated in > 1 TB buffers because in user-land,
- * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
- * the blknos will get through the user-mode buffer
- * cache properly. The only bad case is o32 kernels
- * where xfs_daddr_t is 32-bits but mount will warn us
- * off a > 1 TB filesystem before we get here.
- */
- if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
- if ((error = xlog_recover_do_buffer_trans(log, item,
- pass)))
- break;
- } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
- if ((error = xlog_recover_do_inode_trans(log, item,
- pass)))
- break;
- } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
- if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
- pass)))
- break;
- } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ error = xlog_recover_do_buffer_trans(log, item, pass);
+ break;
+ case XFS_LI_INODE:
+ error = xlog_recover_do_inode_trans(log, item, pass);
+ break;
+ case XFS_LI_EFI:
+ error = xlog_recover_do_efi_trans(log, item,
+ trans->r_lsn, pass);
+ break;
+ case XFS_LI_EFD:
xlog_recover_do_efd_trans(log, item, pass);
- } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
- if ((error = xlog_recover_do_dquot_trans(log, item,
- pass)))
- break;
- } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
- if ((error = xlog_recover_do_quotaoff_trans(log, item,
- pass)))
- break;
- } else {
- xlog_warn("XFS: xlog_recover_do_trans");
+ error = 0;
+ break;
+ case XFS_LI_DQUOT:
+ error = xlog_recover_do_dquot_trans(log, item, pass);
+ break;
+ case XFS_LI_QUOTAOFF:
+ error = xlog_recover_do_quotaoff_trans(log, item,
+ pass);
+ break;
+ default:
+ xlog_warn(
+ "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
ASSERT(0);
error = XFS_ERROR(EIO);
break;
}
+
+ if (error)
+ return error;
item = item->ri_next;
} while (first_item != item);
- return error;
+ return 0;
}
/*
@@ -3490,9 +3513,11 @@ xlog_do_recovery_pass(
hbp = xlog_get_bp(log, 1);
if (!hbp)
return ENOMEM;
- if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+
+ error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+ if (error)
goto bread_err1;
- offset = xlog_align(log, tail_blk, 1, hbp);
+
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead, tail_blk);
if (error)
@@ -3526,9 +3551,10 @@ xlog_do_recovery_pass(
memset(rhash, 0, sizeof(rhash));
if (tail_blk <= head_blk) {
for (blk_no = tail_blk; blk_no < head_blk; ) {
- if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no, hblks, hbp);
+
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead, blk_no);
if (error)
@@ -3536,10 +3562,11 @@ xlog_do_recovery_pass(
/* blocks in data section */
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
- error = xlog_bread(log, blk_no + hblks, bblks, dbp);
+ error = xlog_bread(log, blk_no + hblks, bblks, dbp,
+ &offset);
if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no + hblks, bblks, dbp);
+
xlog_unpack_data(rhead, offset, log);
if ((error = xlog_recover_process_data(log,
rhash, rhead, offset, pass)))
@@ -3562,10 +3589,10 @@ xlog_do_recovery_pass(
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
/* Read header in one read */
- error = xlog_bread(log, blk_no, hblks, hbp);
+ error = xlog_bread(log, blk_no, hblks, hbp,
+ &offset);
if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no, hblks, hbp);
} else {
/* This LR is split across physical log end */
if (blk_no != log->l_logBBsize) {
@@ -3573,12 +3600,13 @@ xlog_do_recovery_pass(
ASSERT(blk_no <= INT_MAX);
split_hblks = log->l_logBBsize - (int)blk_no;
ASSERT(split_hblks > 0);
- if ((error = xlog_bread(log, blk_no,
- split_hblks, hbp)))
+ error = xlog_bread(log, blk_no,
+ split_hblks, hbp,
+ &offset);
+ if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no,
- split_hblks, hbp);
}
+
/*
* Note: this black magic still works with
* large sector sizes (non-512) only because:
@@ -3596,14 +3624,19 @@ xlog_do_recovery_pass(
error = XFS_BUF_SET_PTR(hbp,
bufaddr + BBTOB(split_hblks),
BBTOB(hblks - split_hblks));
- if (!error)
- error = xlog_bread(log, 0,
- wrapped_hblks, hbp);
- if (!error)
- error = XFS_BUF_SET_PTR(hbp, bufaddr,
+ if (error)
+ goto bread_err2;
+
+ error = xlog_bread_noalign(log, 0,
+ wrapped_hblks, hbp);
+ if (error)
+ goto bread_err2;
+
+ error = XFS_BUF_SET_PTR(hbp, bufaddr,
BBTOB(hblks));
if (error)
goto bread_err2;
+
if (!offset)
offset = xlog_align(log, 0,
wrapped_hblks, hbp);
@@ -3619,10 +3652,10 @@ xlog_do_recovery_pass(
/* Read in data for log record */
if (blk_no + bblks <= log->l_logBBsize) {
- error = xlog_bread(log, blk_no, bblks, dbp);
+ error = xlog_bread(log, blk_no, bblks, dbp,
+ &offset);
if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no, bblks, dbp);
} else {
/* This log record is split across the
* physical end of log */
@@ -3636,12 +3669,13 @@ xlog_do_recovery_pass(
split_bblks =
log->l_logBBsize - (int)blk_no;
ASSERT(split_bblks > 0);
- if ((error = xlog_bread(log, blk_no,
- split_bblks, dbp)))
+ error = xlog_bread(log, blk_no,
+ split_bblks, dbp,
+ &offset);
+ if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no,
- split_bblks, dbp);
}
+
/*
* Note: this black magic still works with
* large sector sizes (non-512) only because:
@@ -3658,15 +3692,19 @@ xlog_do_recovery_pass(
error = XFS_BUF_SET_PTR(dbp,
bufaddr + BBTOB(split_bblks),
BBTOB(bblks - split_bblks));
- if (!error)
- error = xlog_bread(log, wrapped_hblks,
- bblks - split_bblks,
- dbp);
- if (!error)
- error = XFS_BUF_SET_PTR(dbp, bufaddr,
- h_size);
if (error)
goto bread_err2;
+
+ error = xlog_bread_noalign(log, wrapped_hblks,
+ bblks - split_bblks,
+ dbp);
+ if (error)
+ goto bread_err2;
+
+ error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+ if (error)
+ goto bread_err2;
+
if (!offset)
offset = xlog_align(log, wrapped_hblks,
bblks - split_bblks, dbp);
@@ -3683,17 +3721,21 @@ xlog_do_recovery_pass(
/* read first part of physical log */
while (blk_no < head_blk) {
- if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no, hblks, hbp);
+
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead, blk_no);
if (error)
goto bread_err2;
+
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
- if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+ error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+ &offset);
+ if (error)
goto bread_err2;
- offset = xlog_align(log, blk_no+hblks, bblks, dbp);
+
xlog_unpack_data(rhead, offset, log);
if ((error = xlog_recover_process_data(log, rhash,
rhead, offset, pass)))
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 35300250e86..b101990df02 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
#include "xfs_fsops.h"
#include "xfs_utils.h"
-STATIC int xfs_uuid_mount(xfs_mount_t *);
STATIC void xfs_unmountfs_wait(xfs_mount_t *);
@@ -121,6 +120,84 @@ static const struct {
{ sizeof(xfs_sb_t), 0 }
};
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int hole, i;
+
+ if (mp->m_flags & XFS_MOUNT_NOUUID)
+ return 0;
+
+ if (uuid_is_nil(uuid)) {
+ cmn_err(CE_WARN,
+ "XFS: Filesystem %s has nil UUID - can't mount",
+ mp->m_fsname);
+ return XFS_ERROR(EINVAL);
+ }
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_nil(&xfs_uuid_table[i])) {
+ hole = i;
+ continue;
+ }
+ if (uuid_equal(uuid, &xfs_uuid_table[i]))
+ goto out_duplicate;
+ }
+
+ if (hole < 0) {
+ xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+ xfs_uuid_table_size * sizeof(*xfs_uuid_table),
+ KM_SLEEP);
+ hole = xfs_uuid_table_size++;
+ }
+ xfs_uuid_table[hole] = *uuid;
+ mutex_unlock(&xfs_uuid_table_mutex);
+
+ return 0;
+
+ out_duplicate:
+ mutex_unlock(&xfs_uuid_table_mutex);
+ cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
+ mp->m_fsname);
+ return XFS_ERROR(EINVAL);
+}
+
+STATIC void
+xfs_uuid_unmount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int i;
+
+ if (mp->m_flags & XFS_MOUNT_NOUUID)
+ return;
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_nil(&xfs_uuid_table[i]))
+ continue;
+ if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+ continue;
+ memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+ break;
+ }
+ ASSERT(i < xfs_uuid_table_size);
+ mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+
/*
* Free up the resources associated with a mount structure. Assume that
* the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
return XFS_ERROR(ENOSYS);
}
+ /*
+ * Currently only very few inode sizes are supported.
+ */
+ switch (sbp->sb_inodesize) {
+ case 256:
+ case 512:
+ case 1024:
+ case 2048:
+ break;
+ default:
+ xfs_fs_mount_cmn_err(flags,
+ "inode size of %d bytes not supported",
+ sbp->sb_inodesize);
+ return XFS_ERROR(ENOSYS);
+ }
+
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
- mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
- /*
- * Setup for attributes, in case they get created.
- * This value is for inodes getting attributes for the first time,
- * the per-inode value is for old attribute values.
- */
- ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
- switch (sbp->sb_inodesize) {
- case 256:
- mp->m_attroffset = XFS_LITINO(mp) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- break;
- case 512:
- case 1024:
- case 2048:
- mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- break;
- default:
- ASSERT(0);
- }
- ASSERT(mp->m_attroffset < XFS_LITINO(mp));
-
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
for (index = 0; index < agcount; index++) {
/*
* read the agf, then the agi. This gets us
- * all the inforamtion we need and populates the
+ * all the information we need and populates the
* per-ag structures for us.
*/
error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
}
/*
- * xfs_mountfs
- *
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
* - if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
xfs_inode_t *rip;
__uint64_t resblks;
uint quotamount, quotaflags;
- int uuid_mounted = 0;
int error = 0;
xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
*/
error = xfs_update_alignment(mp);
if (error)
- goto error1;
+ goto out;
xfs_alloc_compute_maxlevels(mp);
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
- /*
- * XFS uses the uuid from the superblock as the unique
- * identifier for fsid. We can not use the uuid from the volume
- * since a single partition filesystem is identical to a single
- * partition volume/filesystem.
- */
- if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
- if (xfs_uuid_mount(mp)) {
- error = XFS_ERROR(EINVAL);
- goto error1;
- }
- uuid_mounted=1;
- }
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out;
/*
* Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
*/
error = xfs_check_sizes(mp);
if (error)
- goto error1;
+ goto out_remove_uuid;
/*
* Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
error = xfs_rtmount_init(mp);
if (error) {
cmn_err(CE_WARN, "XFS: RT mount failed");
- goto error1;
+ goto out_remove_uuid;
}
/*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
KM_MAYFAIL);
if (!mp->m_perag)
- goto error1;
+ goto out_remove_uuid;
mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+ if (!sbp->sb_logblocks) {
+ cmn_err(CE_WARN, "XFS: no log defined");
+ XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out_free_perag;
+ }
+
/*
* log's mount-time initialization. Perform 1st part recovery if needed
*/
- if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */
- error = xfs_log_mount(mp, mp->m_logdev_targp,
- XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
- XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
- if (error) {
- cmn_err(CE_WARN, "XFS: log mount failed");
- goto error2;
- }
- } else { /* No log has been defined */
- cmn_err(CE_WARN, "XFS: no log defined");
- XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
- error = XFS_ERROR(EFSCORRUPTED);
- goto error2;
+ error = xfs_log_mount(mp, mp->m_logdev_targp,
+ XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+ XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+ if (error) {
+ cmn_err(CE_WARN, "XFS: log mount failed");
+ goto out_free_perag;
}
/*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
* If we are currently making the filesystem, the initialisation will
* fail as the perag data is in an undefined state.
*/
-
if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
!XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
!mp->m_sb.sb_inprogress) {
error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
- if (error) {
- goto error2;
- }
+ if (error)
+ goto out_free_perag;
}
+
/*
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
if (error) {
cmn_err(CE_WARN, "XFS: failed to read root inode");
- goto error3;
+ goto out_log_dealloc;
}
ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
mp);
error = XFS_ERROR(EFSCORRUPTED);
- goto error4;
+ goto out_rele_rip;
}
mp->m_rootip = rip; /* save it */
@@ -1131,7 +1188,7 @@ xfs_mountfs(
* Free up the root inode.
*/
cmn_err(CE_WARN, "XFS: failed to read RT inodes");
- goto error4;
+ goto out_rele_rip;
}
/*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
error = xfs_mount_log_sb(mp, mp->m_update_flags);
if (error) {
cmn_err(CE_WARN, "XFS: failed to write sb changes");
- goto error4;
+ goto out_rtunmount;
}
}
@@ -1152,7 +1209,7 @@ xfs_mountfs(
*/
error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
if (error)
- goto error4;
+ goto out_rtunmount;
/*
* Finish recovering the file system. This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
error = xfs_log_mount_finish(mp);
if (error) {
cmn_err(CE_WARN, "XFS: log mount finish failed");
- goto error4;
+ goto out_rtunmount;
}
/*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
*/
error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
if (error)
- goto error4;
+ goto out_rtunmount;
/*
* Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
return 0;
- error4:
- /*
- * Free up the root inode.
- */
+ out_rtunmount:
+ xfs_rtunmount_inodes(mp);
+ out_rele_rip:
IRELE(rip);
- error3:
- xfs_log_unmount_dealloc(mp);
- error2:
+ out_log_dealloc:
+ xfs_log_unmount(mp);
+ out_free_perag:
xfs_free_perag(mp);
- error1:
- if (uuid_mounted)
- uuid_table_remove(&mp->m_sb.sb_uuid);
+ out_remove_uuid:
+ xfs_uuid_unmount(mp);
+ out:
return error;
}
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
*/
XFS_QM_UNMOUNT(mp);
- if (mp->m_rbmip)
- IRELE(mp->m_rbmip);
- if (mp->m_rsumip)
- IRELE(mp->m_rsumip);
+ xfs_rtunmount_inodes(mp);
IRELE(mp->m_rootip);
/*
* We can potentially deadlock here if we have an inode cluster
- * that has been freed has it's buffer still pinned in memory because
+ * that has been freed has its buffer still pinned in memory because
* the transaction is still sitting in a iclog. The stale inodes
* on that buffer will have their flush locks held until the
* transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
* Unreserve any blocks we have so that when we unmount we don't account
* the reserved free space as used. This is really only necessary for
* lazy superblock counting because it trusts the incore superblock
- * counters to be aboslutely correct on clean unmount.
+ * counters to be absolutely correct on clean unmount.
*
* We don't bother correcting this elsewhere for lazy superblock
* counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_unmountfs_writesb(mp);
xfs_unmountfs_wait(mp); /* wait for async bufs */
- xfs_log_unmount(mp); /* Done! No more fs ops. */
-
- if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
- uuid_table_remove(&mp->m_sb.sb_uuid);
+ xfs_log_unmount_write(mp);
+ xfs_log_unmount(mp);
+ xfs_uuid_unmount(mp);
#if defined(DEBUG)
xfs_errortag_clearall(mp, 0);
@@ -1793,29 +1845,6 @@ xfs_freesb(
}
/*
- * See if the UUID is unique among mounted XFS filesystems.
- * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
- */
-STATIC int
-xfs_uuid_mount(
- xfs_mount_t *mp)
-{
- if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
- cmn_err(CE_WARN,
- "XFS: Filesystem %s has nil UUID - can't mount",
- mp->m_fsname);
- return -1;
- }
- if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
- cmn_err(CE_WARN,
- "XFS: Filesystem %s has duplicate UUID - can't mount",
- mp->m_fsname);
- return -1;
- }
- return 0;
-}
-
-/*
* Used to log changes to the superblock unit and width fields which could
* be altered by the mount options, as well as any potential sb_features2
* fixup. Only the first superblock is updated.
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
* we disable the per-cpu counter and go through the slow path.
*
* The slow path is the current xfs_mod_incore_sb() function. This means that
- * when we disable a per-cpu counter, we need to drain it's resources back to
+ * when we disable a per-cpu counter, we need to drain its resources back to
* the global superblock. We do this after disabling the counter to prevent
* more threads from queueing up on the counter.
*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f5e9937f9bd..7af44adffc8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
struct xfs_dquot *, struct xfs_dquot *, uint);
typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
-typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
typedef struct xfs_qmops {
xfs_qminit_t xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
xfs_dqvopchownresv_t xfs_dqvopchownresv;
xfs_dqstatvfs_t xfs_dqstatvfs;
xfs_dqsync_t xfs_dqsync;
- xfs_quotactl_t xfs_quotactl;
struct xfs_dqtrxops *xfs_dqtrxops;
} xfs_qmops_t;
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
#define XFS_QM_DQSYNC(mp, flags) \
(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
- (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
#ifdef HAVE_PERCPU_SB
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
uint m_inobt_mnr[2]; /* min inobt btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
- uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
+ uint m_in_maxlevels; /* max inobt btree levels. */
struct xfs_perag *m_perag; /* per-ag accounting info */
struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
- uint m_attroffset; /* inode attribute offset */
uint m_dir_node_ents; /* #entries in a dir danode */
uint m_attr_node_ents; /* #entries in attr danode */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
- int m_litino; /* size of inode union area */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
__uint64_t m_maxioffset; /* maximum inode offset */
__uint64_t m_resblks; /* total reserved blocks */
__uint64_t m_resblks_avail;/* available reserved blocks */
-#if XFS_BIG_INUMS
- xfs_ino_t m_inoadd; /* add value for ino64_offset */
-#endif
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
-#define XFS_MOUNT_INO64 (1ULL << 1)
#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
* Synchronous read and write sizes. This should be
* better for NFSv2 wsync filesystems.
*/
-#define XFS_WSYNC_READIO_LOG 15 /* 32K */
-#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */
+#define XFS_WSYNC_READIO_LOG 15 /* 32k */
+#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
/*
* Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
int64_t msb_delta; /* Change to make to specified field */
} xfs_mod_sb_t;
-#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
-#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
-
extern int xfs_log_sbcount(xfs_mount_t *, uint);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 27f80581520..e101790ea8e 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
.xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
.xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval,
.xfs_dqsync = (xfs_dqsync_t) fs_noerr,
- .xfs_quotactl = (xfs_quotactl_t) fs_nosys,
};
int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 48965ecaa15..f5d1202dde2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,6 +18,8 @@
#ifndef __XFS_QUOTA_H__
#define __XFS_QUOTA_H__
+struct xfs_trans;
+
/*
* The ondisk form of a dquot structure.
*/
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c5bb86f3ec0..385f6dceba5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
return 0;
}
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp)
+{
+ if (mp->m_rbmip)
+ IRELE(mp->m_rbmip);
+ if (mp->m_rsumip)
+ IRELE(mp->m_rsumip);
+}
+
/*
* Pick an extent for allocation at the start of a new realtime file.
* Use the sequence number stored in the atime field of the bitmap inode.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 8d8dcd21571..b2d67adb6a0 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,8 +23,8 @@ struct xfs_trans;
/* Min and max rt extent sizes, specified in bytes */
#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
-#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */
-#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
/*
* Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
int /* error */
xfs_rtmount_init(
struct xfs_mount *mp); /* file system mount structure */
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp);
/*
* Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d6fe4a88d79..775249a54f6 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
* In a write transaction we can allocate a maximum of 2
* extents. This gives:
* the inode getting the new extents: inode size
- * the inode\'s bmap btree: max depth * block size
+ * the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
/*
* In truncating a file we free up to two extents at once. We can modify:
* the inode being truncated: inode size
- * the inode\'s bmap btree: (max depth + 1) * block size
+ * the inode's bmap btree: (max depth + 1) * block size
* And the bmap_finish transaction can free the blocks and bmap blocks:
* the agf for each of the ags: 4 * sector size
* the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
(128 * 5) + \
XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+ (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
* the new inode: inode size
* the inode btree entry: 1 block
* the directory btree: (max depth + v2) * dir block size
- * the directory inode\'s bmap btree: (max depth + v2) * block size
- * the blocks for the symlink: 1 KB
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the blocks for the symlink: 1 kB
* Or in the first xact we allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
(2 * (mp)->m_sb.sb_sectsize + \
XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+ XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+ (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
* the inode btree entry: block size
* the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
- * the directory inode\'s bmap btree: (max depth + v2) * block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
* Or in the first xact we allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
(3 * (mp)->m_sb.sb_sectsize + \
XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
- XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+ XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+ (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
(128 * 5) + \
XFS_ALLOCFREE_LOG_RES(mp, 1) + \
- (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+ (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
/*
* Removing the attribute fork of a file
* the inode being truncated: inode size
- * the inode\'s bmap btree: max depth * block size
+ * the inode's bmap btree: max depth * block size
* And the bmap_finish transaction can free the blocks and bmap blocks:
* the agf for each of the ags: 4 * sector size
* the agfl for each of the ags: 4 * sector size
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2d47f10f8be..f31271c30de 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
* the push is run asynchronously in a separate thread, so we return the tail
* of the log right now instead of the tail after the push. This means we will
* either continue right away, or we will sleep waiting on the async thread to
- * do it's work.
+ * do its work.
*
* We do this unlocked - we only need to know whether there is anything in the
* AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
/*
* Now that the traversal is complete, we need to remove the cursor
* from the list of traversing cursors. Avoid removing the embedded
- * push cursor, but use the fact it is alway present to make the
+ * push cursor, but use the fact it is always present to make the
* list deletion simple.
*/
void
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index e110bf57d7f..eb3fc57f9ee 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,7 +22,7 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
#include "xfs_bit.h"
#include "xfs_buf_item.h"
#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 4ea2e5074bd..7d2c920dfb9 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
- (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+ (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
/*
* Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b2f724502f1..d725428c9df 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -21,14 +21,6 @@
#ifdef __KERNEL__
/*
- * POSIX Extensions
- */
-typedef unsigned char uchar_t;
-typedef unsigned short ushort_t;
-typedef unsigned int uint_t;
-typedef unsigned long ulong_t;
-
-/*
* Additional type declarations for XFS
*/
typedef signed char __int8_t;
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index fcc2285d03e..79b9e5ea535 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -374,7 +374,7 @@ xfs_truncate_file(
/*
* Follow the normal truncate locking protocol. Since we
- * hold the inode in the transaction, we know that it's number
+ * hold the inode in the transaction, we know that its number
* of references will stay constant.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0e55c5d7db5..7394c7af5de 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1136,7 +1136,7 @@ xfs_inactive(
* If the inode is already free, then there can be nothing
* to clean up here.
*/
- if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
+ if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
ASSERT(ip->i_df.if_real_bytes == 0);
ASSERT(ip->i_df.if_broot_bytes == 0);
return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
xfs_inode_t **ipp,
cred_t *credp)
{
- xfs_mount_t *mp = dp->i_mount;
- xfs_inode_t *ip;
- xfs_trans_t *tp;
+ int is_dir = S_ISDIR(mode);
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
boolean_t unlock_dp_on_error = B_FALSE;
- int dm_event_sent = 0;
uint cancel_flags;
int committed;
xfs_prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
uint resblks;
+ uint log_res;
+ uint log_count;
- ASSERT(!*ipp);
xfs_itrace_entry(dp);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
if (error)
return error;
- dm_event_sent = 1;
}
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- /* Return through std_return after this point. */
-
- udqp = gdqp = NULL;
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
prid = dp->i_d.di_projid;
else
- prid = (xfs_prid_t)dfltprid;
+ prid = dfltprid;
/*
* Make sure that we have allocated dquot(s) on disk.
*/
error = XFS_QM_DQVOPALLOC(mp, dp,
current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
if (error)
goto std_return;
- ip = NULL;
+ if (is_dir) {
+ rdev = 0;
+ resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ log_res = XFS_MKDIR_LOG_RES(mp);
+ log_count = XFS_MKDIR_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+ } else {
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ log_res = XFS_CREATE_LOG_RES(mp);
+ log_count = XFS_CREATE_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ }
- tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
if (error == ENOSPC) {
resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, 0, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
}
if (error) {
cancel_flags = 0;
- goto error_return;
+ goto out_trans_cancel;
}
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = B_TRUE;
- xfs_bmap_init(&free_list, &first_block);
+ /*
+ * Check for directory link count overflow.
+ */
+ if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+ error = XFS_ERROR(EMLINK);
+ goto out_trans_cancel;
+ }
- ASSERT(ip == NULL);
+ xfs_bmap_init(&free_list, &first_block);
/*
* Reserve disk quota and the inode.
*/
error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
if (error)
- goto error_return;
+ goto out_trans_cancel;
error = xfs_dir_canenter(tp, dp, name, resblks);
if (error)
- goto error_return;
- error = xfs_dir_ialloc(&tp, dp, mode, 1,
- rdev, credp, prid, resblks > 0,
- &ip, &committed);
+ goto out_trans_cancel;
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to them, but a directory also the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+ prid, resblks > 0, &ip, &committed);
if (error) {
if (error == ENOSPC)
- goto error_return;
- goto abort_return;
+ goto out_trans_cancel;
+ goto out_trans_abort;
}
- xfs_itrace_ref(ip);
/*
* At this point, we've gotten a newly allocated inode.
* It is locked (and joined to the transaction).
*/
-
+ xfs_itrace_ref(ip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
@@ -1508,19 +1526,28 @@ xfs_create(
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != ENOSPC);
- goto abort_return;
+ goto out_trans_abort;
}
xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ if (is_dir) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bumplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+ }
+
/*
* If this is a synchronous mount, make sure that the
* create transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- }
/*
* Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
IHOLD(ip);
error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- xfs_bmap_cancel(&free_list);
- goto abort_rele;
- }
+ if (error)
+ goto out_abort_rele;
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
if (error) {
IRELE(ip);
- tp = NULL;
- goto error_return;
+ goto out_dqrele;
}
XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
*ipp = ip;
/* Fallthrough to std_return with error = 0 */
-
-std_return:
- if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
- DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
- dp, DM_RIGHT_NULL,
- *ipp ? ip : NULL,
- DM_RIGHT_NULL, name->name, NULL,
- mode, error, 0);
+ std_return:
+ if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
+ XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
+ ip, DM_RIGHT_NULL, name->name, NULL, mode,
+ error, 0);
}
+
return error;
- abort_return:
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_abort:
cancel_flags |= XFS_TRANS_ABORT;
- /* FALLTHROUGH */
-
- error_return:
- if (tp != NULL)
- xfs_trans_cancel(tp, cancel_flags);
-
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_dqrele:
XFS_QM_DQRELE(mp, udqp);
XFS_QM_DQRELE(mp, gdqp);
@@ -1583,20 +1603,18 @@ std_return:
goto std_return;
- abort_rele:
+ out_abort_rele:
/*
* Wait until after the current transaction is aborted to
* release the inode. This prevents recursive transactions
* and deadlocks from xfs_inactive.
*/
+ xfs_bmap_cancel(&free_list);
cancel_flags |= XFS_TRANS_ABORT;
xfs_trans_cancel(tp, cancel_flags);
IRELE(ip);
-
- XFS_QM_DQRELE(mp, udqp);
- XFS_QM_DQRELE(mp, gdqp);
-
- goto std_return;
+ unlock_dp_on_error = B_FALSE;
+ goto out_dqrele;
}
#ifdef DEBUG
@@ -2004,8 +2022,10 @@ xfs_link(
/* Return through std_return after this point. */
error = XFS_QM_DQATTACH(mp, sip, 0);
- if (!error && sip != tdp)
- error = XFS_QM_DQATTACH(mp, tdp, 0);
+ if (error)
+ goto std_return;
+
+ error = XFS_QM_DQATTACH(mp, tdp, 0);
if (error)
goto std_return;
@@ -2110,209 +2130,6 @@ std_return:
goto std_return;
}
-
-int
-xfs_mkdir(
- xfs_inode_t *dp,
- struct xfs_name *dir_name,
- mode_t mode,
- xfs_inode_t **ipp,
- cred_t *credp)
-{
- xfs_mount_t *mp = dp->i_mount;
- xfs_inode_t *cdp; /* inode of created dir */
- xfs_trans_t *tp;
- int cancel_flags;
- int error;
- int committed;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- boolean_t unlock_dp_on_error = B_FALSE;
- boolean_t created = B_FALSE;
- int dm_event_sent = 0;
- xfs_prid_t prid;
- struct xfs_dquot *udqp, *gdqp;
- uint resblks;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- tp = NULL;
-
- if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
- dp, DM_RIGHT_NULL, NULL,
- DM_RIGHT_NULL, dir_name->name, NULL,
- mode, 0, 0);
- if (error)
- return error;
- dm_event_sent = 1;
- }
-
- /* Return through std_return after this point. */
-
- xfs_itrace_entry(dp);
-
- mp = dp->i_mount;
- udqp = gdqp = NULL;
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = dp->i_d.di_projid;
- else
- prid = (xfs_prid_t)dfltprid;
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = XFS_QM_DQVOPALLOC(mp, dp,
- current_fsuid(), current_fsgid(), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
- if (error)
- goto std_return;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
- error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
- if (error == ENOSPC) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_MKDIR_LOG_COUNT);
- }
- if (error) {
- cancel_flags = 0;
- goto error_return;
- }
-
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- unlock_dp_on_error = B_TRUE;
-
- /*
- * Check for directory link count overflow.
- */
- if (dp->i_d.di_nlink >= XFS_MAXLINK) {
- error = XFS_ERROR(EMLINK);
- goto error_return;
- }
-
- /*
- * Reserve disk quota and the inode.
- */
- error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
- if (error)
- goto error_return;
-
- error = xfs_dir_canenter(tp, dp, dir_name, resblks);
- if (error)
- goto error_return;
- /*
- * create the directory inode.
- */
- error = xfs_dir_ialloc(&tp, dp, mode, 2,
- 0, credp, prid, resblks > 0,
- &cdp, NULL);
- if (error) {
- if (error == ENOSPC)
- goto error_return;
- goto abort_return;
- }
- xfs_itrace_ref(cdp);
-
- /*
- * Now we add the directory inode to the transaction.
- * We waited until now since xfs_dir_ialloc might start
- * a new transaction. Had we joined the transaction
- * earlier, the locks might have gotten released. An error
- * from here on will result in the transaction cancel
- * unlocking dp so don't do it explicitly in the error path.
- */
- IHOLD(dp);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- unlock_dp_on_error = B_FALSE;
-
- xfs_bmap_init(&free_list, &first_block);
-
- error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
- &first_block, &free_list, resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
- if (error) {
- ASSERT(error != ENOSPC);
- goto error1;
- }
- xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- error = xfs_dir_init(tp, cdp, dp);
- if (error)
- goto error2;
-
- error = xfs_bumplink(tp, dp);
- if (error)
- goto error2;
-
- created = B_TRUE;
-
- *ipp = cdp;
- IHOLD(cdp);
-
- /*
- * Attach the dquots to the new inode and modify the icount incore.
- */
- XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
-
- /*
- * If this is a synchronous mount, make sure that the
- * mkdir transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- IRELE(cdp);
- goto error2;
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- XFS_QM_DQRELE(mp, udqp);
- XFS_QM_DQRELE(mp, gdqp);
- if (error) {
- IRELE(cdp);
- }
-
- /* Fall through to std_return with error = 0 or errno from
- * xfs_trans_commit. */
-
-std_return:
- if ((created || (error != 0 && dm_event_sent != 0)) &&
- DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
- dp, DM_RIGHT_NULL,
- created ? cdp : NULL,
- DM_RIGHT_NULL,
- dir_name->name, NULL,
- mode, error, 0);
- }
- return error;
-
- error2:
- error1:
- xfs_bmap_cancel(&free_list);
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
- XFS_QM_DQRELE(mp, udqp);
- XFS_QM_DQRELE(mp, gdqp);
-
- if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- goto std_return;
-}
-
int
xfs_symlink(
xfs_inode_t *dp,
@@ -2587,51 +2404,6 @@ std_return:
}
int
-xfs_inode_flush(
- xfs_inode_t *ip,
- int flags)
-{
- xfs_mount_t *mp = ip->i_mount;
- int error = 0;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- /*
- * Bypass inodes which have already been cleaned by
- * the inode flush clustering code inside xfs_iflush
- */
- if (xfs_inode_clean(ip))
- return 0;
-
- /*
- * We make this non-blocking if the inode is contended,
- * return EAGAIN to indicate to the caller that they
- * did not succeed. This prevents the flush path from
- * blocking on inodes inside another operation right
- * now, they get caught later by xfs_sync.
- */
- if (flags & FLUSH_SYNC) {
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- xfs_iflock(ip);
- } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
- if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return EAGAIN;
- }
- } else {
- return EAGAIN;
- }
-
- error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
- : XFS_IFLUSH_ASYNC_NOBLOCK);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- return error;
-}
-
-
-int
xfs_set_dmattrs(
xfs_inode_t *ip,
u_int evmask,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
ASSERT(!VN_MAPPED(VFS_I(ip)));
/* bad inode, get out here ASAP */
- if (VN_BAD(VFS_I(ip))) {
+ if (is_bad_inode(VFS_I(ip))) {
xfs_ireclaim(ip);
return 0;
}
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
/*
* Need to zero the stuff we're not freeing, on disk.
- * If its a realtime file & can't use unwritten extents then we
+ * If it's a realtime file & can't use unwritten extents then we
* actually need to zero the extent edges. Otherwise xfs_bunmapi
* will take care of it for us.
*/
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 76df328c61b..04373c6c61f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
-int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
- mode_t mode, struct xfs_inode **ipp, cred_t *credp);
int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
xfs_off_t *offset, filldir_t filldir);
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, mode_t mode, struct xfs_inode **ipp,
cred_t *credp);
-int xfs_inode_flush(struct xfs_inode *ip, int flags);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,